1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37 
38 #include "regeximp.h"
39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
40                         //   generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44 
45 
46 
47 U_NAMESPACE_BEGIN
48 
49 
50 //------------------------------------------------------------------------------
51 //
52 //  Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56    fParenStack(status), fSetStack(status), fSetOpStack(status)
57 {
58     // Lazy init of all shared global sets (needed for init()'s empty text)
59     RegexStaticSets::initGlobals(&status);
60 
61     fStatus           = &status;
62 
63     fRXPat            = rxp;
64     fScanIndex        = 0;
65     fLastChar         = -1;
66     fPeekChar         = -1;
67     fLineNum          = 1;
68     fCharNum          = 0;
69     fQuoteMode        = FALSE;
70     fInBackslashQuote = FALSE;
71     fModeFlags        = fRXPat->fFlags | 0x80000000;
72     fEOLComments      = TRUE;
73 
74     fMatchOpenParen   = -1;
75     fMatchCloseParen  = -1;
76     fCaptureName      = NULL;
77     fLastSetLiteral   = U_SENTINEL;
78 
79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80         status = rxp->fDeferredStatus;
81     }
82 }
83 
84 static const UChar      chAmp       = 0x26;      // '&'
85 static const UChar      chDash      = 0x2d;      // '-'
86 
87 
88 //------------------------------------------------------------------------------
89 //
90 //  Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
95                                  //   compilation stops with a syntax error.
96 }
97 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101 
102 //------------------------------------------------------------------------------
103 //
104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105 //                           The state tables are hand-written in the file regexcst.txt,
106 //                           and converted to the form used here by a perl
107 //                           script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void    RegexCompile::compile(
111                          const UnicodeString &pat,   // Source pat to be compiled.
112                          UParseError &pp,            // Error position info
113                          UErrorCode &e)              // Error Code
114 {
115     fRXPat->fPatternString = new UnicodeString(pat);
116     UText patternText = UTEXT_INITIALIZER;
117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118 
119     if (U_SUCCESS(e)) {
120         compile(&patternText, pp, e);
121         utext_close(&patternText);
122     }
123 }
124 
125 //
126 //   compile, UText mode
127 //     All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void    RegexCompile::compile(
130                          UText *pat,                 // Source pat to be compiled.
131                          UParseError &pp,            // Error position info
132                          UErrorCode &e)              // Error Code
133 {
134     fStatus             = &e;
135     fParseErr           = &pp;
136     fStackPtr           = 0;
137     fStack[fStackPtr]   = 0;
138 
139     if (U_FAILURE(*fStatus)) {
140         return;
141     }
142 
143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145 
146     // Prepare the RegexPattern object to receive the compiled pattern.
147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148     if (U_FAILURE(*fStatus)) {
149         return;
150     }
151     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
152     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
153 
154 
155     // Initialize the pattern scanning state machine
156     fPatternLength = utext_nativeLength(pat);
157     uint16_t                state = 1;
158     const RegexTableEl      *tableEl;
159 
160     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161     if (fModeFlags & UREGEX_LITERAL) {
162         fQuoteMode = TRUE;
163     }
164 
165     nextChar(fC);                        // Fetch the first char from the pattern string.
166 
167     //
168     // Main loop for the regex pattern parsing state machine.
169     //   Runs once per state transition.
170     //   Each time through optionally performs, depending on the state table,
171     //      - an advance to the the next pattern char
172     //      - an action to be performed.
173     //      - pushing or popping a state to/from the local state return stack.
174     //   file regexcst.txt is the source for the state table.  The logic behind
175     //     recongizing the pattern syntax is there, not here.
176     //
177     for (;;) {
178         //  Bail out if anything has gone wrong.
179         //  Regex pattern parsing stops on the first error encountered.
180         if (U_FAILURE(*fStatus)) {
181             break;
182         }
183 
184         U_ASSERT(state != 0);
185 
186         // Find the state table element that matches the input char from the pattern, or the
187         //    class of the input character.  Start with the first table row for this
188         //    state, then linearly scan forward until we find a row that matches the
189         //    character.  The last row for each state always matches all characters, so
190         //    the search will stop there, if not before.
191         //
192         tableEl = &gRuleParseStateTable[state];
193         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
194             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195 
196         for (;;) {    // loop through table rows belonging to this state, looking for one
197                       //   that matches the current input char.
198             REGEX_SCAN_DEBUG_PRINTF(("."));
199             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
200                 // Table row specified an individual character, not a set, and
201                 //   the input character is not quoted, and
202                 //   the input character matched it.
203                 break;
204             }
205             if (tableEl->fCharClass == 255) {
206                 // Table row specified default, match anything character class.
207                 break;
208             }
209             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
210                 // Table row specified "quoted" and the char was quoted.
211                 break;
212             }
213             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
214                 // Table row specified eof and we hit eof on the input.
215                 break;
216             }
217 
218             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
219                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
220                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
221                 U_ASSERT(tableEl->fCharClass <= 137);
222                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
223                     // Table row specified a character class, or set of characters,
224                     //   and the current char matches it.
225                     break;
226                 }
227             }
228 
229             // No match on this row, advance to the next  row for this state,
230             tableEl++;
231         }
232         REGEX_SCAN_DEBUG_PRINTF(("\n"));
233 
234         //
235         // We've found the row of the state table that matches the current input
236         //   character from the rules string.
237         // Perform any action specified  by this row in the state table.
238         if (doParseActions(tableEl->fAction) == FALSE) {
239             // Break out of the state machine loop if the
240             //   the action signalled some kind of error, or
241             //   the action was to exit, occurs on normal end-of-rules-input.
242             break;
243         }
244 
245         if (tableEl->fPushState != 0) {
246             fStackPtr++;
247             if (fStackPtr >= kStackSize) {
248                 error(U_REGEX_INTERNAL_ERROR);
249                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250                 fStackPtr--;
251             }
252             fStack[fStackPtr] = tableEl->fPushState;
253         }
254 
255         //
256         //  NextChar.  This is where characters are actually fetched from the pattern.
257         //             Happens under control of the 'n' tag in the state table.
258         //
259         if (tableEl->fNextChar) {
260             nextChar(fC);
261         }
262 
263         // Get the next state from the table entry, or from the
264         //   state stack if the next state was specified as "pop".
265         if (tableEl->fNextState != 255) {
266             state = tableEl->fNextState;
267         } else {
268             state = fStack[fStackPtr];
269             fStackPtr--;
270             if (fStackPtr < 0) {
271                 // state stack underflow
272                 // This will occur if the user pattern has mis-matched parentheses,
273                 //   with extra close parens.
274                 //
275                 fStackPtr++;
276                 error(U_REGEX_MISMATCHED_PAREN);
277             }
278         }
279 
280     }
281 
282     if (U_FAILURE(*fStatus)) {
283         // Bail out if the pattern had errors.
284         //   Set stack cleanup:  a successful compile would have left it empty,
285         //   but errors can leave temporary sets hanging around.
286         while (!fSetStack.empty()) {
287             delete (UnicodeSet *)fSetStack.pop();
288         }
289         return;
290     }
291 
292     //
293     // The pattern has now been read and processed, and the compiled code generated.
294     //
295 
296     //
297     // The pattern's fFrameSize so far has accumulated the requirements for
298     //   storage for capture parentheses, counters, etc. that are encountered
299     //   in the pattern.  Add space for the two variables that are always
300     //   present in the saved state:  the input string position (int64_t) and
301     //   the position in the compiled pattern.
302     //
303     allocateStackData(RESTACKFRAME_HDRCOUNT);
304 
305     //
306     // Optimization pass 1: NOPs, back-references, and case-folding
307     //
308     stripNOPs();
309 
310     //
311     // Get bounds for the minimum and maximum length of a string that this
312     //   pattern can match.  Used to avoid looking for matches in strings that
313     //   are too short.
314     //
315     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
316 
317     //
318     // Optimization pass 2: match start type
319     //
320     matchStartType();
321 
322     //
323     // Set up fast latin-1 range sets
324     //
325     int32_t numSets = fRXPat->fSets->size();
326     fRXPat->fSets8 = new Regex8BitSet[numSets];
327     // Null pointer check.
328     if (fRXPat->fSets8 == NULL) {
329         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330         return;
331     }
332     int32_t i;
333     for (i=0; i<numSets; i++) {
334         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
335         fRXPat->fSets8[i].init(s);
336     }
337 
338 }
339 
340 
341 
342 
343 
344 //------------------------------------------------------------------------------
345 //
346 //  doParseAction        Do some action during regex pattern parsing.
347 //                       Called by the parse state machine.
348 //
349 //                       Generation of the match engine PCode happens here, or
350 //                       in functions called from the parse actions defined here.
351 //
352 //
353 //------------------------------------------------------------------------------
doParseActions(int32_t action)354 UBool RegexCompile::doParseActions(int32_t action)
355 {
356     UBool   returnVal = TRUE;
357 
358     switch ((Regex_PatternParseAction)action) {
359 
360     case doPatStart:
361         // Start of pattern compiles to:
362         //0   SAVE   2        Fall back to position of FAIL
363         //1   jmp    3
364         //2   FAIL            Stop if we ever reach here.
365         //3   NOP             Dummy, so start of pattern looks the same as
366         //                    the start of an ( grouping.
367         //4   NOP             Resreved, will be replaced by a save if there are
368         //                    OR | operators at the top level
369         appendOp(URX_STATE_SAVE, 2);
370         appendOp(URX_JMP,  3);
371         appendOp(URX_FAIL, 0);
372 
373         // Standard open nonCapture paren action emits the two NOPs and
374         //   sets up the paren stack frame.
375         doParseActions(doOpenNonCaptureParen);
376         break;
377 
378     case doPatFinish:
379         // We've scanned to the end of the pattern
380         //  The end of pattern compiles to:
381         //        URX_END
382         //    which will stop the runtime match engine.
383         //  Encountering end of pattern also behaves like a close paren,
384         //   and forces fixups of the State Save at the beginning of the compiled pattern
385         //   and of any OR operations at the top level.
386         //
387         handleCloseParen();
388         if (fParenStack.size() > 0) {
389             // Missing close paren in pattern.
390             error(U_REGEX_MISMATCHED_PAREN);
391         }
392 
393         // add the END operation to the compiled pattern.
394         appendOp(URX_END, 0);
395 
396         // Terminate the pattern compilation state machine.
397         returnVal = FALSE;
398         break;
399 
400 
401 
402     case doOrOperator:
403         // Scanning a '|', as in (A|B)
404         {
405             // Generate code for any pending literals preceding the '|'
406             fixLiterals(FALSE);
407 
408             // Insert a SAVE operation at the start of the pattern section preceding
409             //   this OR at this level.  This SAVE will branch the match forward
410             //   to the right hand side of the OR in the event that the left hand
411             //   side fails to match and backtracks.  Locate the position for the
412             //   save from the location on the top of the parentheses stack.
413             int32_t savePosition = fParenStack.popi();
414             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
416             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
417             fRXPat->fCompiledPat->setElementAt(op, savePosition);
418 
419             // Append an JMP operation into the compiled pattern.  The operand for
420             //  the JMP will eventually be the location following the ')' for the
421             //  group.  This will be patched in later, when the ')' is encountered.
422             appendOp(URX_JMP, 0);
423 
424             // Push the position of the newly added JMP op onto the parentheses stack.
425             // This registers if for fixup when this block's close paren is encountered.
426             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
427 
428             // Append a NOP to the compiled pattern.  This is the slot reserved
429             //   for a SAVE in the event that there is yet another '|' following
430             //   this one.
431             appendOp(URX_NOP, 0);
432             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
433         }
434         break;
435 
436 
437     case doBeginNamedCapture:
438         // Scanning (?<letter.
439         //   The first letter of the name will come through again under doConinueNamedCapture.
440         fCaptureName = new UnicodeString();
441         if (fCaptureName == NULL) {
442             error(U_MEMORY_ALLOCATION_ERROR);
443         }
444         break;
445 
446     case  doContinueNamedCapture:
447         fCaptureName->append(fC.fChar);
448         break;
449 
450     case doBadNamedCapture:
451         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452         break;
453 
454     case doOpenCaptureParen:
455         // Open Capturing Paren, possibly named.
456         //   Compile to a
457         //      - NOP, which later may be replaced by a save-state if the
458         //         parenthesized group gets a * quantifier, followed by
459         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
460         //      - NOP, which may later be replaced by a save-state if there
461         //             is an '|' alternation within the parens.
462         //
463         //    Each capture group gets three slots in the save stack frame:
464         //         0: Capture Group start position (in input string being matched.)
465         //         1: Capture Group end position.
466         //         2: Start of Match-in-progress.
467         //    The first two locations are for a completed capture group, and are
468         //     referred to by back references and the like.
469         //    The third location stores the capture start position when an START_CAPTURE is
470         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
471         //      END_CAPTURE is encountered.
472         {
473             fixLiterals();
474             appendOp(URX_NOP, 0);
475             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
476             appendOp(URX_START_CAPTURE, varsLoc);
477             appendOp(URX_NOP, 0);
478 
479             // On the Parentheses stack, start a new frame and add the postions
480             //   of the two NOPs.  Depending on what follows in the pattern, the
481             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
482             //   address of the end of the parenthesized group.
483             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
484             fParenStack.push(capturing, *fStatus);                        // Frame type.
485             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
486             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
487 
488             // Save the mapping from group number to stack frame variable position.
489             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490 
491             // If this is a named capture group, add the name->group number mapping.
492             if (fCaptureName != NULL) {
493                 int32_t groupNumber = fRXPat->fGroupMap->size();
494                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
495                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
496                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
497                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
498                 }
499             }
500         }
501         break;
502 
503     case doOpenNonCaptureParen:
504         // Open non-caputuring (grouping only) Paren.
505         //   Compile to a
506         //      - NOP, which later may be replaced by a save-state if the
507         //         parenthesized group gets a * quantifier, followed by
508         //      - NOP, which may later be replaced by a save-state if there
509         //             is an '|' alternation within the parens.
510         {
511             fixLiterals();
512             appendOp(URX_NOP, 0);
513             appendOp(URX_NOP, 0);
514 
515             // On the Parentheses stack, start a new frame and add the postions
516             //   of the two NOPs.
517             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
518             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
519             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
520             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
521         }
522          break;
523 
524 
525     case doOpenAtomicParen:
526         // Open Atomic Paren.  (?>
527         //   Compile to a
528         //      - NOP, which later may be replaced if the parenthesized group
529         //         has a quantifier, followed by
530         //      - STO_SP  save state stack position, so it can be restored at the ")"
531         //      - NOP, which may later be replaced by a save-state if there
532         //             is an '|' alternation within the parens.
533         {
534             fixLiterals();
535             appendOp(URX_NOP, 0);
536             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
537             appendOp(URX_STO_SP, varLoc);
538             appendOp(URX_NOP, 0);
539 
540             // On the Parentheses stack, start a new frame and add the postions
541             //   of the two NOPs.  Depending on what follows in the pattern, the
542             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
543             //   address of the end of the parenthesized group.
544             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
545             fParenStack.push(atomic, *fStatus);                           // Frame type.
546             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
547             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
548         }
549         break;
550 
551 
552     case doOpenLookAhead:
553         // Positive Look-ahead   (?=  stuff  )
554         //
555         //   Note:   Addition of transparent input regions, with the need to
556         //           restore the original regions when failing out of a lookahead
557         //           block, complicated this sequence.  Some conbined opcodes
558         //           might make sense - or might not, lookahead aren't that common.
559         //
560         //      Caution:  min match length optimization knows about this
561         //               sequence; don't change without making updates there too.
562         //
563         // Compiles to
564         //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
565         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
566         //    3    JMP          6           continue ...
567         //
568         //    4.   LA_END                   Look Ahead failed.  Restore regions.
569         //    5.   BACKTRACK                and back track again.
570         //
571         //    6.   NOP              reserved for use by quantifiers on the block.
572         //                          Look-ahead can't have quantifiers, but paren stack
573         //                             compile time conventions require the slot anyhow.
574         //    7.   NOP              may be replaced if there is are '|' ops in the block.
575         //    8.     code for parenthesized stuff.
576         //    9.   LA_END
577         //
578         //  Four data slots are reserved, for saving state on entry to the look-around
579         //    0:   stack pointer on entry.
580         //    1:   input position on entry.
581         //    2:   fActiveStart, the active bounds start on entry.
582         //    3:   fActiveLimit, the active bounds limit on entry.
583         {
584             fixLiterals();
585             int32_t dataLoc = allocateData(4);
586             appendOp(URX_LA_START, dataLoc);
587             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
588             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
589             appendOp(URX_LA_END, dataLoc);
590             appendOp(URX_BACKTRACK, 0);
591             appendOp(URX_NOP, 0);
592             appendOp(URX_NOP, 0);
593 
594             // On the Parentheses stack, start a new frame and add the postions
595             //   of the NOPs.
596             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
597             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
598             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
599             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
600         }
601         break;
602 
603     case doOpenLookAheadNeg:
604         // Negated Lookahead.   (?! stuff )
605         // Compiles to
606         //    1.    LA_START    dataloc
607         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
608         //                                //   which continues with the match.
609         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
610         //    4.       code for parenthesized stuff.
611         //    5.    LA_END                // Cut back stack, remove saved state from step 2.
612         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
613         //    7.    END_LA                // Restore match region, in case look-ahead was using
614         //                                        an alternate (transparent) region.
615         //  Four data slots are reserved, for saving state on entry to the look-around
616         //    0:   stack pointer on entry.
617         //    1:   input position on entry.
618         //    2:   fActiveStart, the active bounds start on entry.
619         //    3:   fActiveLimit, the active bounds limit on entry.
620         {
621             fixLiterals();
622             int32_t dataLoc = allocateData(4);
623             appendOp(URX_LA_START, dataLoc);
624             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
625             appendOp(URX_NOP, 0);
626 
627             // On the Parentheses stack, start a new frame and add the postions
628             //   of the StateSave and NOP.
629             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
630             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
631             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
632             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
633 
634             // Instructions #5 - #7 will be added when the ')' is encountered.
635         }
636         break;
637 
638     case doOpenLookBehind:
639         {
640             //   Compile a (?<= look-behind open paren.
641             //
642             //          Compiles to
643             //              0       URX_LB_START     dataLoc
644             //              1       URX_LB_CONT      dataLoc
645             //              2                        MinMatchLen
646             //              3                        MaxMatchLen
647             //              4       URX_NOP          Standard '(' boilerplate.
648             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
649             //              6         <code for LookBehind expression>
650             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
651             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
652             //
653             //          Allocate a block of matcher data, to contain (when running a match)
654             //              0:    Stack ptr on entry
655             //              1:    Input Index on entry
656             //              2:    fActiveStart, the active bounds start on entry.
657             //              3:    fActiveLimit, the active bounds limit on entry.
658             //              4:    Start index of match current match attempt.
659             //          The first four items must match the layout of data for LA_START / LA_END
660 
661             // Generate match code for any pending literals.
662             fixLiterals();
663 
664             // Allocate data space
665             int32_t dataLoc = allocateData(5);
666 
667             // Emit URX_LB_START
668             appendOp(URX_LB_START, dataLoc);
669 
670             // Emit URX_LB_CONT
671             appendOp(URX_LB_CONT, dataLoc);
672             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
673             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
674 
675             // Emit the NOPs
676             appendOp(URX_NOP, 0);
677             appendOp(URX_NOP, 0);
678 
679             // On the Parentheses stack, start a new frame and add the postions
680             //   of the URX_LB_CONT and the NOP.
681             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
682             fParenStack.push(lookBehind, *fStatus);                       // Frame type
683             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
684             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
685 
686             // The final two instructions will be added when the ')' is encountered.
687         }
688 
689         break;
690 
691     case doOpenLookBehindNeg:
692         {
693             //   Compile a (?<! negated look-behind open paren.
694             //
695             //          Compiles to
696             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
697             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
698             //              2                        MinMatchLen
699             //              3                        MaxMatchLen
700             //              4                        continueLoc (9)
701             //              5       URX_NOP          Standard '(' boilerplate.
702             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
703             //              7         <code for LookBehind expression>
704             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
705             //              9       ...
706             //
707             //          Allocate a block of matcher data, to contain (when running a match)
708             //              0:    Stack ptr on entry
709             //              1:    Input Index on entry
710             //              2:    fActiveStart, the active bounds start on entry.
711             //              3:    fActiveLimit, the active bounds limit on entry.
712             //              4:    Start index of match current match attempt.
713             //          The first four items must match the layout of data for LA_START / LA_END
714 
715             // Generate match code for any pending literals.
716             fixLiterals();
717 
718             // Allocate data space
719             int32_t dataLoc = allocateData(5);
720 
721             // Emit URX_LB_START
722             appendOp(URX_LB_START, dataLoc);
723 
724             // Emit URX_LBN_CONT
725             appendOp(URX_LBN_CONT, dataLoc);
726             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
727             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
728             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
729 
730             // Emit the NOPs
731             appendOp(URX_NOP, 0);
732             appendOp(URX_NOP, 0);
733 
734             // On the Parentheses stack, start a new frame and add the postions
735             //   of the URX_LB_CONT and the NOP.
736             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
737             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
738             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
739             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
740 
741             // The final two instructions will be added when the ')' is encountered.
742         }
743         break;
744 
745     case doConditionalExpr:
746         // Conditionals such as (?(1)a:b)
747     case doPerlInline:
748         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
749         error(U_REGEX_UNIMPLEMENTED);
750         break;
751 
752 
753     case doCloseParen:
754         handleCloseParen();
755         if (fParenStack.size() <= 0) {
756             //  Extra close paren, or missing open paren.
757             error(U_REGEX_MISMATCHED_PAREN);
758         }
759         break;
760 
761     case doNOP:
762         break;
763 
764 
765     case doBadOpenParenType:
766     case doRuleError:
767         error(U_REGEX_RULE_SYNTAX);
768         break;
769 
770 
771     case doMismatchedParenErr:
772         error(U_REGEX_MISMATCHED_PAREN);
773         break;
774 
775     case doPlus:
776         //  Normal '+'  compiles to
777         //     1.   stuff to be repeated  (already built)
778         //     2.   jmp-sav 1
779         //     3.   ...
780         //
781         //  Or, if the item to be repeated can match a zero length string,
782         //     1.   STO_INP_LOC  data-loc
783         //     2.      body of stuff to be repeated
784         //     3.   JMP_SAV_X    2
785         //     4.   ...
786 
787         //
788         //  Or, if the item to be repeated is simple
789         //     1.   Item to be repeated.
790         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
791         //     3.   LOOP_C       stack location
792         {
793             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
794             int32_t  frameLoc;
795 
796             // Check for simple constructs, which may get special optimized code.
797             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
798                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
799 
800                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
801                     // Emit optimized code for [char set]+
802                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
803                     frameLoc = allocateStackData(1);
804                     appendOp(URX_LOOP_C, frameLoc);
805                     break;
806                 }
807 
808                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
809                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
810                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
811                     // Emit Optimized code for .+ operations.
812                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
813                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
814                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
815                         loopOpI |= 1;
816                     }
817                     if (fModeFlags & UREGEX_UNIX_LINES) {
818                         loopOpI |= 2;
819                     }
820                     appendOp(loopOpI);
821                     frameLoc = allocateStackData(1);
822                     appendOp(URX_LOOP_C, frameLoc);
823                     break;
824                 }
825 
826             }
827 
828             // General case.
829 
830             // Check for minimum match length of zero, which requires
831             //    extra loop-breaking code.
832             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
833                 // Zero length match is possible.
834                 // Emit the code sequence that can handle it.
835                 insertOp(topLoc);
836                 frameLoc = allocateStackData(1);
837 
838                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
839                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
840 
841                 appendOp(URX_JMP_SAV_X, topLoc+1);
842             } else {
843                 // Simpler code when the repeated body must match something non-empty
844                 appendOp(URX_JMP_SAV, topLoc);
845             }
846         }
847         break;
848 
849     case doNGPlus:
850         //  Non-greedy '+?'  compiles to
851         //     1.   stuff to be repeated  (already built)
852         //     2.   state-save  1
853         //     3.   ...
854         {
855             int32_t topLoc      = blockTopLoc(FALSE);
856             appendOp(URX_STATE_SAVE, topLoc);
857         }
858         break;
859 
860 
861     case doOpt:
862         // Normal (greedy) ? quantifier.
863         //  Compiles to
864         //     1. state save 3
865         //     2.    body of optional block
866         //     3. ...
867         // Insert the state save into the compiled pattern, and we're done.
868         {
869             int32_t   saveStateLoc = blockTopLoc(TRUE);
870             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
871             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
872         }
873         break;
874 
875     case doNGOpt:
876         // Non-greedy ?? quantifier
877         //   compiles to
878         //    1.  jmp   4
879         //    2.     body of optional block
880         //    3   jmp   5
881         //    4.  state save 2
882         //    5    ...
883         //  This code is less than ideal, with two jmps instead of one, because we can only
884         //  insert one instruction at the top of the block being iterated.
885         {
886             int32_t  jmp1_loc = blockTopLoc(TRUE);
887             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
888 
889             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
890             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
891 
892             appendOp(URX_JMP, jmp2_loc+2);
893 
894             appendOp(URX_STATE_SAVE, jmp1_loc+1);
895         }
896         break;
897 
898 
899     case doStar:
900         // Normal (greedy) * quantifier.
901         // Compiles to
902         //       1.   STATE_SAVE   4
903         //       2.      body of stuff being iterated over
904         //       3.   JMP_SAV      2
905         //       4.   ...
906         //
907         // Or, if the body is a simple [Set],
908         //       1.   LOOP_SR_I    set number
909         //       2.   LOOP_C       stack location
910         //       ...
911         //
912         // Or if this is a .*
913         //       1.   LOOP_DOT_I    (. matches all mode flag)
914         //       2.   LOOP_C        stack location
915         //
916         // Or, if the body can match a zero-length string, to inhibit infinite loops,
917         //       1.   STATE_SAVE   5
918         //       2.   STO_INP_LOC  data-loc
919         //       3.      body of stuff
920         //       4.   JMP_SAV_X    2
921         //       5.   ...
922         {
923             // location of item #1, the STATE_SAVE
924             int32_t   topLoc = blockTopLoc(FALSE);
925             int32_t   dataLoc = -1;
926 
927             // Check for simple *, where the construct being repeated
928             //   compiled to single opcode, and might be optimizable.
929             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
930                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
931 
932                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
933                     // Emit optimized code for a [char set]*
934                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
935                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
936                     dataLoc = allocateStackData(1);
937                     appendOp(URX_LOOP_C, dataLoc);
938                     break;
939                 }
940 
941                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
942                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
943                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
944                     // Emit Optimized code for .* operations.
945                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
946                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
947                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
948                         loopOpI |= 1;
949                     }
950                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
951                         loopOpI |= 2;
952                     }
953                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
954                     dataLoc = allocateStackData(1);
955                     appendOp(URX_LOOP_C, dataLoc);
956                     break;
957                 }
958             }
959 
960             // Emit general case code for this *
961             // The optimizations did not apply.
962 
963             int32_t   saveStateLoc = blockTopLoc(TRUE);
964             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
965 
966             // Check for minimum match length of zero, which requires
967             //    extra loop-breaking code.
968             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
969                 insertOp(saveStateLoc);
970                 dataLoc = allocateStackData(1);
971 
972                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
973                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
974                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
975             }
976 
977             // Locate the position in the compiled pattern where the match will continue
978             //   after completing the *.   (4 or 5 in the comment above)
979             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
980 
981             // Put together the save state op and store it into the compiled code.
982             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
983             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
984 
985             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
986             appendOp(jmpOp);
987         }
988         break;
989 
990     case doNGStar:
991         // Non-greedy *? quantifier
992         // compiles to
993         //     1.   JMP    3
994         //     2.      body of stuff being iterated over
995         //     3.   STATE_SAVE  2
996         //     4    ...
997         {
998             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
999             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
1000             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
1001             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1002             appendOp(URX_STATE_SAVE, jmpLoc+1);
1003         }
1004         break;
1005 
1006 
1007     case doIntervalInit:
1008         // The '{' opening an interval quantifier was just scanned.
1009         // Init the counter varaiables that will accumulate the values as the digits
1010         //    are scanned.
1011         fIntervalLow = 0;
1012         fIntervalUpper = -1;
1013         break;
1014 
1015     case doIntevalLowerDigit:
1016         // Scanned a digit from the lower value of an {lower,upper} interval
1017         {
1018             int32_t digitValue = u_charDigitValue(fC.fChar);
1019             U_ASSERT(digitValue >= 0);
1020             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1021             if (val > INT32_MAX) {
1022                 error(U_REGEX_NUMBER_TOO_BIG);
1023             } else {
1024                 fIntervalLow = (int32_t)val;
1025             }
1026         }
1027         break;
1028 
1029     case doIntervalUpperDigit:
1030         // Scanned a digit from the upper value of an {lower,upper} interval
1031         {
1032             if (fIntervalUpper < 0) {
1033                 fIntervalUpper = 0;
1034             }
1035             int32_t digitValue = u_charDigitValue(fC.fChar);
1036             U_ASSERT(digitValue >= 0);
1037             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1038             if (val > INT32_MAX) {
1039                 error(U_REGEX_NUMBER_TOO_BIG);
1040             } else {
1041                 fIntervalUpper = (int32_t)val;
1042             }
1043         }
1044         break;
1045 
1046     case doIntervalSame:
1047         // Scanned a single value interval like {27}.  Upper = Lower.
1048         fIntervalUpper = fIntervalLow;
1049         break;
1050 
1051     case doInterval:
1052         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1053         if (compileInlineInterval() == FALSE) {
1054             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1055         }
1056         break;
1057 
1058     case doPossessiveInterval:
1059         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1060         {
1061             // Remember the loc for the top of the block being looped over.
1062             //   (Can not reserve a slot in the compiled pattern at this time, because
1063             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1064             //    once per block.)
1065             int32_t topLoc = blockTopLoc(FALSE);
1066 
1067             // Produce normal looping code.
1068             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1069 
1070             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1071             //  just as if the loop was inclosed in atomic parentheses.
1072 
1073             // First the STO_SP before the start of the loop
1074             insertOp(topLoc);
1075 
1076             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1077             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1078             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1079 
1080             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1081             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1082             loopOp++;     // point LoopOp after the just-inserted STO_SP
1083             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1084 
1085             // Then the LD_SP after the end of the loop
1086             appendOp(URX_LD_SP, varLoc);
1087         }
1088 
1089         break;
1090 
1091     case doNGInterval:
1092         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1093         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1094         break;
1095 
1096     case doIntervalError:
1097         error(U_REGEX_BAD_INTERVAL);
1098         break;
1099 
1100     case doLiteralChar:
1101         // We've just scanned a "normal" character from the pattern,
1102         literalChar(fC.fChar);
1103         break;
1104 
1105 
1106     case doEscapedLiteralChar:
1107         // We've just scanned an backslashed escaped character with  no
1108         //   special meaning.  It represents itself.
1109         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1110             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1111             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1112                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1113              }
1114         literalChar(fC.fChar);
1115         break;
1116 
1117 
1118     case doDotAny:
1119         // scanned a ".",  match any single character.
1120         {
1121             fixLiterals(FALSE);
1122             if (fModeFlags & UREGEX_DOTALL) {
1123                 appendOp(URX_DOTANY_ALL, 0);
1124             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1125                 appendOp(URX_DOTANY_UNIX, 0);
1126             } else {
1127                 appendOp(URX_DOTANY, 0);
1128             }
1129         }
1130         break;
1131 
1132     case doCaret:
1133         {
1134             fixLiterals(FALSE);
1135             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136                 appendOp(URX_CARET, 0);
1137             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1138                 appendOp(URX_CARET_M, 0);
1139             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1141             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1142                 appendOp(URX_CARET_M_UNIX, 0);
1143             }
1144         }
1145         break;
1146 
1147     case doDollar:
1148         {
1149             fixLiterals(FALSE);
1150             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151                 appendOp(URX_DOLLAR, 0);
1152             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1153                 appendOp(URX_DOLLAR_M, 0);
1154             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155                 appendOp(URX_DOLLAR_D, 0);
1156             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1157                 appendOp(URX_DOLLAR_MD, 0);
1158             }
1159         }
1160         break;
1161 
1162     case doBackslashA:
1163         fixLiterals(FALSE);
1164         appendOp(URX_CARET, 0);
1165         break;
1166 
1167     case doBackslashB:
1168         {
1169             #if  UCONFIG_NO_BREAK_ITERATION==1
1170             if (fModeFlags & UREGEX_UWORD) {
1171                 error(U_UNSUPPORTED_ERROR);
1172             }
1173             #endif
1174             fixLiterals(FALSE);
1175             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1176             appendOp(op, 1);
1177         }
1178         break;
1179 
1180     case doBackslashb:
1181         {
1182             #if  UCONFIG_NO_BREAK_ITERATION==1
1183             if (fModeFlags & UREGEX_UWORD) {
1184                 error(U_UNSUPPORTED_ERROR);
1185             }
1186             #endif
1187             fixLiterals(FALSE);
1188             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1189             appendOp(op, 0);
1190         }
1191         break;
1192 
1193     case doBackslashD:
1194         fixLiterals(FALSE);
1195         appendOp(URX_BACKSLASH_D, 1);
1196         break;
1197 
1198     case doBackslashd:
1199         fixLiterals(FALSE);
1200         appendOp(URX_BACKSLASH_D, 0);
1201         break;
1202 
1203     case doBackslashG:
1204         fixLiterals(FALSE);
1205         appendOp(URX_BACKSLASH_G, 0);
1206         break;
1207 
1208     case doBackslashH:
1209         fixLiterals(FALSE);
1210         appendOp(URX_BACKSLASH_H, 1);
1211         break;
1212 
1213     case doBackslashh:
1214         fixLiterals(FALSE);
1215         appendOp(URX_BACKSLASH_H, 0);
1216         break;
1217 
1218     case doBackslashR:
1219         fixLiterals(FALSE);
1220         appendOp(URX_BACKSLASH_R, 0);
1221         break;
1222 
1223     case doBackslashS:
1224         fixLiterals(FALSE);
1225         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1226         break;
1227 
1228     case doBackslashs:
1229         fixLiterals(FALSE);
1230         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1231         break;
1232 
1233     case doBackslashV:
1234         fixLiterals(FALSE);
1235         appendOp(URX_BACKSLASH_V, 1);
1236         break;
1237 
1238     case doBackslashv:
1239         fixLiterals(FALSE);
1240         appendOp(URX_BACKSLASH_V, 0);
1241         break;
1242 
1243     case doBackslashW:
1244         fixLiterals(FALSE);
1245         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1246         break;
1247 
1248     case doBackslashw:
1249         fixLiterals(FALSE);
1250         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1251         break;
1252 
1253     case doBackslashX:
1254         fixLiterals(FALSE);
1255         appendOp(URX_BACKSLASH_X, 0);
1256         break;
1257 
1258 
1259     case doBackslashZ:
1260         fixLiterals(FALSE);
1261         appendOp(URX_DOLLAR, 0);
1262         break;
1263 
1264     case doBackslashz:
1265         fixLiterals(FALSE);
1266         appendOp(URX_BACKSLASH_Z, 0);
1267         break;
1268 
1269     case doEscapeError:
1270         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1271         break;
1272 
1273     case doExit:
1274         fixLiterals(FALSE);
1275         returnVal = FALSE;
1276         break;
1277 
1278     case doProperty:
1279         {
1280             fixLiterals(FALSE);
1281             UnicodeSet *theSet = scanProp();
1282             compileSet(theSet);
1283         }
1284         break;
1285 
1286     case doNamedChar:
1287         {
1288             UChar32 c = scanNamedChar();
1289             literalChar(c);
1290         }
1291         break;
1292 
1293 
1294     case doBackRef:
1295         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1296         //                 the regular expression, because the number of digits to be consumed
1297         //                 depends on the number of capture groups that have been defined.  So
1298         //                 we have to do it here instead.
1299         {
1300             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1301             int32_t  groupNum = 0;
1302             UChar32  c        = fC.fChar;
1303 
1304             for (;;) {
1305                 // Loop once per digit, for max allowed number of digits in a back reference.
1306                 int32_t digit = u_charDigitValue(c);
1307                 groupNum = groupNum * 10 + digit;
1308                 if (groupNum >= numCaptureGroups) {
1309                     break;
1310                 }
1311                 c = peekCharLL();
1312                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1313                     break;
1314                 }
1315                 nextCharLL();
1316             }
1317 
1318             // Scan of the back reference in the source regexp is complete.  Now generate
1319             //  the compiled code for it.
1320             // Because capture groups can be forward-referenced by back-references,
1321             //  we fill the operand with the capture group number.  At the end
1322             //  of compilation, it will be changed to the variable's location.
1323             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1324                                      //    and shouldn't enter this code path at all.
1325             fixLiterals(FALSE);
1326             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1327                 appendOp(URX_BACKREF_I, groupNum);
1328             } else {
1329                 appendOp(URX_BACKREF, groupNum);
1330             }
1331         }
1332         break;
1333 
1334     case doBeginNamedBackRef:
1335         U_ASSERT(fCaptureName == NULL);
1336         fCaptureName = new UnicodeString;
1337         if (fCaptureName == NULL) {
1338             error(U_MEMORY_ALLOCATION_ERROR);
1339         }
1340         break;
1341 
1342     case doContinueNamedBackRef:
1343         fCaptureName->append(fC.fChar);
1344         break;
1345 
1346     case doCompleteNamedBackRef:
1347         {
1348         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1349         if (groupNumber == 0) {
1350             // Group name has not been defined.
1351             //   Could be a forward reference. If we choose to support them at some
1352             //   future time, extra mechanism will be required at this point.
1353             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1354         } else {
1355             // Given the number, handle identically to a \n numbered back reference.
1356             // See comments above, under doBackRef
1357             fixLiterals(FALSE);
1358             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1359                 appendOp(URX_BACKREF_I, groupNumber);
1360             } else {
1361                 appendOp(URX_BACKREF, groupNumber);
1362             }
1363         }
1364         delete fCaptureName;
1365         fCaptureName = NULL;
1366         break;
1367         }
1368 
1369     case doPossessivePlus:
1370         // Possessive ++ quantifier.
1371         // Compiles to
1372         //       1.   STO_SP
1373         //       2.      body of stuff being iterated over
1374         //       3.   STATE_SAVE 5
1375         //       4.   JMP        2
1376         //       5.   LD_SP
1377         //       6.   ...
1378         //
1379         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1380         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1381         //
1382         {
1383             // Emit the STO_SP
1384             int32_t   topLoc = blockTopLoc(TRUE);
1385             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1386             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1387             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1388 
1389             // Emit the STATE_SAVE
1390             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1391 
1392             // Emit the JMP
1393             appendOp(URX_JMP, topLoc+1);
1394 
1395             // Emit the LD_SP
1396             appendOp(URX_LD_SP, stoLoc);
1397         }
1398         break;
1399 
1400     case doPossessiveStar:
1401         // Possessive *+ quantifier.
1402         // Compiles to
1403         //       1.   STO_SP       loc
1404         //       2.   STATE_SAVE   5
1405         //       3.      body of stuff being iterated over
1406         //       4.   JMP          2
1407         //       5.   LD_SP        loc
1408         //       6    ...
1409         // TODO:  do something to cut back the state stack each time through the loop.
1410         {
1411             // Reserve two slots at the top of the block.
1412             int32_t   topLoc = blockTopLoc(TRUE);
1413             insertOp(topLoc);
1414 
1415             // emit   STO_SP     loc
1416             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1417             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1418             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1419 
1420             // Emit the SAVE_STATE   5
1421             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1422             op = buildOp(URX_STATE_SAVE, L7);
1423             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1424 
1425             // Append the JMP operation.
1426             appendOp(URX_JMP, topLoc+1);
1427 
1428             // Emit the LD_SP       loc
1429             appendOp(URX_LD_SP, stoLoc);
1430         }
1431         break;
1432 
1433     case doPossessiveOpt:
1434         // Possessive  ?+ quantifier.
1435         //  Compiles to
1436         //     1. STO_SP      loc
1437         //     2. SAVE_STATE  5
1438         //     3.    body of optional block
1439         //     4. LD_SP       loc
1440         //     5. ...
1441         //
1442         {
1443             // Reserve two slots at the top of the block.
1444             int32_t   topLoc = blockTopLoc(TRUE);
1445             insertOp(topLoc);
1446 
1447             // Emit the STO_SP
1448             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1449             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1450             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1451 
1452             // Emit the SAVE_STATE
1453             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1454             op = buildOp(URX_STATE_SAVE, continueLoc);
1455             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1456 
1457             // Emit the LD_SP
1458             appendOp(URX_LD_SP, stoLoc);
1459         }
1460         break;
1461 
1462 
1463     case doBeginMatchMode:
1464         fNewModeFlags = fModeFlags;
1465         fSetModeFlag  = TRUE;
1466         break;
1467 
1468     case doMatchMode:   //  (?i)    and similar
1469         {
1470             int32_t  bit = 0;
1471             switch (fC.fChar) {
1472             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1473             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1474             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1475             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1476             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1477             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1478             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1479             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1480             default:
1481                 UPRV_UNREACHABLE;   // Should never happen.  Other chars are filtered out
1482                                    // by the scanner.
1483             }
1484             if (fSetModeFlag) {
1485                 fNewModeFlags |= bit;
1486             } else {
1487                 fNewModeFlags &= ~bit;
1488             }
1489         }
1490         break;
1491 
1492     case doSetMatchMode:
1493         // Emit code to match any pending literals, using the not-yet changed match mode.
1494         fixLiterals();
1495 
1496         // We've got a (?i) or similar.  The match mode is being changed, but
1497         //   the change is not scoped to a parenthesized block.
1498         U_ASSERT(fNewModeFlags < 0);
1499         fModeFlags = fNewModeFlags;
1500 
1501         break;
1502 
1503 
1504     case doMatchModeParen:
1505         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1506         //   mode flags so they can be restored at the close of the block.
1507         //
1508         //   Compile to a
1509         //      - NOP, which later may be replaced by a save-state if the
1510         //         parenthesized group gets a * quantifier, followed by
1511         //      - NOP, which may later be replaced by a save-state if there
1512         //             is an '|' alternation within the parens.
1513         {
1514             fixLiterals(FALSE);
1515             appendOp(URX_NOP, 0);
1516             appendOp(URX_NOP, 0);
1517 
1518             // On the Parentheses stack, start a new frame and add the postions
1519             //   of the two NOPs (a normal non-capturing () frame, except for the
1520             //   saving of the orignal mode flags.)
1521             fParenStack.push(fModeFlags, *fStatus);
1522             fParenStack.push(flags, *fStatus);                            // Frame Marker
1523             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1524             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1525 
1526             // Set the current mode flags to the new values.
1527             U_ASSERT(fNewModeFlags < 0);
1528             fModeFlags = fNewModeFlags;
1529         }
1530         break;
1531 
1532     case doBadModeFlag:
1533         error(U_REGEX_INVALID_FLAG);
1534         break;
1535 
1536     case doSuppressComments:
1537         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1538         // treating a '#' as a to-the-end-of-line comment.
1539         //   (This Perl compatibility just gets uglier and uglier to do...)
1540         fEOLComments = FALSE;
1541         break;
1542 
1543 
1544     case doSetAddAmp:
1545         {
1546           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1547           set->add(chAmp);
1548         }
1549         break;
1550 
1551     case doSetAddDash:
1552         {
1553           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554           set->add(chDash);
1555         }
1556         break;
1557 
1558      case doSetBackslash_s:
1559         {
1560          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1562          break;
1563         }
1564 
1565      case doSetBackslash_S:
1566         {
1567             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569             SSet.complement();
1570             set->addAll(SSet);
1571             break;
1572         }
1573 
1574     case doSetBackslash_d:
1575         {
1576             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1577             // TODO - make a static set, ticket 6058.
1578             addCategory(set, U_GC_ND_MASK, *fStatus);
1579             break;
1580         }
1581 
1582     case doSetBackslash_D:
1583         {
1584             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1585             UnicodeSet digits;
1586             // TODO - make a static set, ticket 6058.
1587             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1588             digits.complement();
1589             set->addAll(digits);
1590             break;
1591         }
1592 
1593     case doSetBackslash_h:
1594         {
1595             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1596             UnicodeSet h;
1597             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1598             h.add((UChar32)9);   // Tab
1599             set->addAll(h);
1600             break;
1601         }
1602 
1603     case doSetBackslash_H:
1604         {
1605             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1606             UnicodeSet h;
1607             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1608             h.add((UChar32)9);   // Tab
1609             h.complement();
1610             set->addAll(h);
1611             break;
1612         }
1613 
1614     case doSetBackslash_v:
1615         {
1616             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1617             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1618             set->add((UChar32)0x85);
1619             set->add((UChar32)0x2028, (UChar32)0x2029);
1620             break;
1621         }
1622 
1623     case doSetBackslash_V:
1624         {
1625             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1626             UnicodeSet v;
1627             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1628             v.add((UChar32)0x85);
1629             v.add((UChar32)0x2028, (UChar32)0x2029);
1630             v.complement();
1631             set->addAll(v);
1632             break;
1633         }
1634 
1635     case doSetBackslash_w:
1636         {
1637             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1638             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1639             break;
1640         }
1641 
1642     case doSetBackslash_W:
1643         {
1644             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1645             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646             SSet.complement();
1647             set->addAll(SSet);
1648             break;
1649         }
1650 
1651     case doSetBegin:
1652         fixLiterals(FALSE);
1653         fSetStack.push(new UnicodeSet(), *fStatus);
1654         fSetOpStack.push(setStart, *fStatus);
1655         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1656             fSetOpStack.push(setCaseClose, *fStatus);
1657         }
1658         break;
1659 
1660     case doSetBeginDifference1:
1661         //  We have scanned something like [[abc]-[
1662         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1663         //  Push a Difference operator, which will cause the new set to be subtracted from what
1664         //    went before once it is created.
1665         setPushOp(setDifference1);
1666         fSetOpStack.push(setStart, *fStatus);
1667         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1668             fSetOpStack.push(setCaseClose, *fStatus);
1669         }
1670         break;
1671 
1672     case doSetBeginIntersection1:
1673         //  We have scanned something like  [[abc]&[
1674         //   Need both the '&' operator and the open '[' operator.
1675         setPushOp(setIntersection1);
1676         fSetOpStack.push(setStart, *fStatus);
1677         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1678             fSetOpStack.push(setCaseClose, *fStatus);
1679         }
1680         break;
1681 
1682     case doSetBeginUnion:
1683         //  We have scanned something like  [[abc][
1684         //     Need to handle the union operation explicitly [[abc] | [
1685         setPushOp(setUnion);
1686         fSetOpStack.push(setStart, *fStatus);
1687         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1688             fSetOpStack.push(setCaseClose, *fStatus);
1689         }
1690         break;
1691 
1692     case doSetDifference2:
1693         // We have scanned something like [abc--
1694         //   Consider this to unambiguously be a set difference operator.
1695         setPushOp(setDifference2);
1696         break;
1697 
1698     case doSetEnd:
1699         // Have encountered the ']' that closes a set.
1700         //    Force the evaluation of any pending operations within this set,
1701         //    leave the completed set on the top of the set stack.
1702         setEval(setEnd);
1703         U_ASSERT(fSetOpStack.peeki()==setStart);
1704         fSetOpStack.popi();
1705         break;
1706 
1707     case doSetFinish:
1708         {
1709         // Finished a complete set expression, including all nested sets.
1710         //   The close bracket has already triggered clearing out pending set operators,
1711         //    the operator stack should be empty and the operand stack should have just
1712         //    one entry, the result set.
1713         U_ASSERT(fSetOpStack.empty());
1714         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1715         U_ASSERT(fSetStack.empty());
1716         compileSet(theSet);
1717         break;
1718         }
1719 
1720     case doSetIntersection2:
1721         // Have scanned something like [abc&&
1722         setPushOp(setIntersection2);
1723         break;
1724 
1725     case doSetLiteral:
1726         // Union the just-scanned literal character into the set being built.
1727         //    This operation is the highest precedence set operation, so we can always do
1728         //    it immediately, without waiting to see what follows.  It is necessary to perform
1729         //    any pending '-' or '&' operation first, because these have the same precedence
1730         //    as union-ing in a literal'
1731         {
1732             setEval(setUnion);
1733             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1734             s->add(fC.fChar);
1735             fLastSetLiteral = fC.fChar;
1736             break;
1737         }
1738 
1739     case doSetLiteralEscaped:
1740         // A back-slash escaped literal character was encountered.
1741         // Processing is the same as with setLiteral, above, with the addition of
1742         //  the optional check for errors on escaped ASCII letters.
1743         {
1744             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1745                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1746                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1747                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1748             }
1749             setEval(setUnion);
1750             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1751             s->add(fC.fChar);
1752             fLastSetLiteral = fC.fChar;
1753             break;
1754         }
1755 
1756         case doSetNamedChar:
1757         // Scanning a \N{UNICODE CHARACTER NAME}
1758         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1759         //    above.
1760         {
1761             UChar32  c = scanNamedChar();
1762             setEval(setUnion);
1763             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1764             s->add(c);
1765             fLastSetLiteral = c;
1766             break;
1767         }
1768 
1769     case doSetNamedRange:
1770         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1771         // The left character is already in the set, and is saved in fLastSetLiteral.
1772         // The right side needs to be picked up, the scan is at the 'N'.
1773         // Lower Limit > Upper limit being an error matches both Java
1774         //        and ICU UnicodeSet behavior.
1775         {
1776             UChar32  c = scanNamedChar();
1777             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1778                 error(U_REGEX_INVALID_RANGE);
1779             }
1780             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1781             s->add(fLastSetLiteral, c);
1782             fLastSetLiteral = c;
1783             break;
1784         }
1785 
1786 
1787     case  doSetNegate:
1788         // Scanned a '^' at the start of a set.
1789         // Push the negation operator onto the set op stack.
1790         // A twist for case-insensitive matching:
1791         //   the case closure operation must happen _before_ negation.
1792         //   But the case closure operation will already be on the stack if it's required.
1793         //   This requires checking for case closure, and swapping the stack order
1794         //    if it is present.
1795         {
1796             int32_t  tosOp = fSetOpStack.peeki();
1797             if (tosOp == setCaseClose) {
1798                 fSetOpStack.popi();
1799                 fSetOpStack.push(setNegation, *fStatus);
1800                 fSetOpStack.push(setCaseClose, *fStatus);
1801             } else {
1802                 fSetOpStack.push(setNegation, *fStatus);
1803             }
1804         }
1805         break;
1806 
1807     case doSetNoCloseError:
1808         error(U_REGEX_MISSING_CLOSE_BRACKET);
1809         break;
1810 
1811     case doSetOpError:
1812         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1813         break;
1814 
1815     case doSetPosixProp:
1816         {
1817             UnicodeSet *s = scanPosixProp();
1818             if (s != NULL) {
1819                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1820                 tos->addAll(*s);
1821                 delete s;
1822             }  // else error.  scanProp() reported the error status already.
1823         }
1824         break;
1825 
1826     case doSetProp:
1827         //  Scanned a \p \P within [brackets].
1828         {
1829             UnicodeSet *s = scanProp();
1830             if (s != NULL) {
1831                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1832                 tos->addAll(*s);
1833                 delete s;
1834             }  // else error.  scanProp() reported the error status already.
1835         }
1836         break;
1837 
1838 
1839     case doSetRange:
1840         // We have scanned literal-literal.  Add the range to the set.
1841         // The left character is already in the set, and is saved in fLastSetLiteral.
1842         // The right side is the current character.
1843         // Lower Limit > Upper limit being an error matches both Java
1844         //        and ICU UnicodeSet behavior.
1845         {
1846 
1847         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1848             error(U_REGEX_INVALID_RANGE);
1849         }
1850         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1851         s->add(fLastSetLiteral, fC.fChar);
1852         break;
1853         }
1854 
1855     default:
1856         UPRV_UNREACHABLE;
1857     }
1858 
1859     if (U_FAILURE(*fStatus)) {
1860         returnVal = FALSE;
1861     }
1862 
1863     return returnVal;
1864 }
1865 
1866 
1867 
1868 //------------------------------------------------------------------------------
1869 //
1870 //   literalChar           We've encountered a literal character from the pattern,
1871 //                             or an escape sequence that reduces to a character.
1872 //                         Add it to the string containing all literal chars/strings from
1873 //                             the pattern.
1874 //
1875 //------------------------------------------------------------------------------
literalChar(UChar32 c)1876 void RegexCompile::literalChar(UChar32 c)  {
1877     fLiteralChars.append(c);
1878 }
1879 
1880 
1881 //------------------------------------------------------------------------------
1882 //
1883 //    fixLiterals           When compiling something that can follow a literal
1884 //                          string in a pattern, emit the code to match the
1885 //                          accumulated literal string.
1886 //
1887 //                          Optionally, split the last char of the string off into
1888 //                          a single "ONE_CHAR" operation, so that quantifiers can
1889 //                          apply to that char alone.  Example:   abc*
1890 //                          The * must apply to the 'c' only.
1891 //
1892 //------------------------------------------------------------------------------
fixLiterals(UBool split)1893 void    RegexCompile::fixLiterals(UBool split) {
1894 
1895     // If no literal characters have been scanned but not yet had code generated
1896     //   for them, nothing needs to be done.
1897     if (fLiteralChars.length() == 0) {
1898         return;
1899     }
1900 
1901     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1902     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1903 
1904     // Split:  We need to  ensure that the last item in the compiled pattern
1905     //     refers only to the last literal scanned in the pattern, so that
1906     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1907     //     Split before case folding for case insensitive matches.
1908 
1909     if (split) {
1910         fLiteralChars.truncate(indexOfLastCodePoint);
1911         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1912                               //  Note that the truncated literal string may be empty, in which case
1913                               //  nothing will be emitted.
1914 
1915         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1916         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1917         return;
1918     }
1919 
1920     // If we are doing case-insensitive matching, case fold the string.  This may expand
1921     //   the string, e.g. the German sharp-s turns into "ss"
1922     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1923         fLiteralChars.foldCase();
1924         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1925         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1926     }
1927 
1928     if (indexOfLastCodePoint == 0) {
1929         // Single character, emit a URX_ONECHAR op to match it.
1930         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1931                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1932             appendOp(URX_ONECHAR_I, lastCodePoint);
1933         } else {
1934             appendOp(URX_ONECHAR, lastCodePoint);
1935         }
1936     } else {
1937         // Two or more chars, emit a URX_STRING to match them.
1938         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1939             error(U_REGEX_PATTERN_TOO_BIG);
1940         }
1941         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1942             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1943         } else {
1944             // TODO here:  add optimization to split case sensitive strings of length two
1945             //             into two single char ops, for efficiency.
1946             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1947         }
1948         appendOp(URX_STRING_LEN, fLiteralChars.length());
1949 
1950         // Add this string into the accumulated strings of the compiled pattern.
1951         fRXPat->fLiteralText.append(fLiteralChars);
1952     }
1953 
1954     fLiteralChars.remove();
1955 }
1956 
1957 
buildOp(int32_t type,int32_t val)1958 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1959     if (U_FAILURE(*fStatus)) {
1960         return 0;
1961     }
1962     if (type < 0 || type > 255) {
1963         UPRV_UNREACHABLE;
1964     }
1965     if (val > 0x00ffffff) {
1966         UPRV_UNREACHABLE;
1967     }
1968     if (val < 0) {
1969         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1970             UPRV_UNREACHABLE;
1971         }
1972         if (URX_TYPE(val) != 0xff) {
1973             UPRV_UNREACHABLE;
1974         }
1975         type = URX_RESERVED_OP_N;
1976     }
1977     return (type << 24) | val;
1978 }
1979 
1980 
1981 //------------------------------------------------------------------------------
1982 //
1983 //   appendOp()             Append a new instruction onto the compiled pattern
1984 //                          Includes error checking, limiting the size of the
1985 //                          pattern to lengths that can be represented in the
1986 //                          24 bit operand field of an instruction.
1987 //
1988 //------------------------------------------------------------------------------
appendOp(int32_t op)1989 void RegexCompile::appendOp(int32_t op) {
1990     if (U_FAILURE(*fStatus)) {
1991         return;
1992     }
1993     fRXPat->fCompiledPat->addElement(op, *fStatus);
1994     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1995         error(U_REGEX_PATTERN_TOO_BIG);
1996     }
1997 }
1998 
appendOp(int32_t type,int32_t val)1999 void RegexCompile::appendOp(int32_t type, int32_t val) {
2000     appendOp(buildOp(type, val));
2001 }
2002 
2003 
2004 //------------------------------------------------------------------------------
2005 //
2006 //   insertOp()             Insert a slot for a new opcode into the already
2007 //                          compiled pattern code.
2008 //
2009 //                          Fill the slot with a NOP.  Our caller will replace it
2010 //                          with what they really wanted.
2011 //
2012 //------------------------------------------------------------------------------
insertOp(int32_t where)2013 void   RegexCompile::insertOp(int32_t where) {
2014     UVector64 *code = fRXPat->fCompiledPat;
2015     U_ASSERT(where>0 && where < code->size());
2016 
2017     int32_t  nop = buildOp(URX_NOP, 0);
2018     code->insertElementAt(nop, where, *fStatus);
2019 
2020     // Walk through the pattern, looking for any ops with targets that
2021     //  were moved down by the insert.  Fix them.
2022     int32_t loc;
2023     for (loc=0; loc<code->size(); loc++) {
2024         int32_t op = (int32_t)code->elementAti(loc);
2025         int32_t opType = URX_TYPE(op);
2026         int32_t opValue = URX_VAL(op);
2027         if ((opType == URX_JMP         ||
2028             opType == URX_JMPX         ||
2029             opType == URX_STATE_SAVE   ||
2030             opType == URX_CTR_LOOP     ||
2031             opType == URX_CTR_LOOP_NG  ||
2032             opType == URX_JMP_SAV      ||
2033             opType == URX_JMP_SAV_X    ||
2034             opType == URX_RELOC_OPRND)    && opValue > where) {
2035             // Target location for this opcode is after the insertion point and
2036             //   needs to be incremented to adjust for the insertion.
2037             opValue++;
2038             op = buildOp(opType, opValue);
2039             code->setElementAt(op, loc);
2040         }
2041     }
2042 
2043     // Now fix up the parentheses stack.  All positive values in it are locations in
2044     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2045     for (loc=0; loc<fParenStack.size(); loc++) {
2046         int32_t x = fParenStack.elementAti(loc);
2047         U_ASSERT(x < code->size());
2048         if (x>where) {
2049             x++;
2050             fParenStack.setElementAt(x, loc);
2051         }
2052     }
2053 
2054     if (fMatchCloseParen > where) {
2055         fMatchCloseParen++;
2056     }
2057     if (fMatchOpenParen > where) {
2058         fMatchOpenParen++;
2059     }
2060 }
2061 
2062 
2063 //------------------------------------------------------------------------------
2064 //
2065 //   allocateData()        Allocate storage in the matcher's static data area.
2066 //                         Return the index for the newly allocated data.
2067 //                         The storage won't actually exist until we are running a match
2068 //                         operation, but the storage indexes are inserted into various
2069 //                         opcodes while compiling the pattern.
2070 //
2071 //------------------------------------------------------------------------------
allocateData(int32_t size)2072 int32_t RegexCompile::allocateData(int32_t size) {
2073     if (U_FAILURE(*fStatus)) {
2074         return 0;
2075     }
2076     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2077         error(U_REGEX_INTERNAL_ERROR);
2078         return 0;
2079     }
2080     int32_t dataIndex = fRXPat->fDataSize;
2081     fRXPat->fDataSize += size;
2082     if (fRXPat->fDataSize >= 0x00fffff0) {
2083         error(U_REGEX_INTERNAL_ERROR);
2084     }
2085     return dataIndex;
2086 }
2087 
2088 
2089 //------------------------------------------------------------------------------
2090 //
2091 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2092 //                         Return the index for the newly allocated data.
2093 //                         The frame indexes are inserted into various
2094 //                         opcodes while compiling the pattern, meaning that frame
2095 //                         size must be restricted to the size that will fit
2096 //                         as an operand (24 bits).
2097 //
2098 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2099 int32_t RegexCompile::allocateStackData(int32_t size) {
2100     if (U_FAILURE(*fStatus)) {
2101         return 0;
2102     }
2103     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2104         error(U_REGEX_INTERNAL_ERROR);
2105         return 0;
2106     }
2107     int32_t dataIndex = fRXPat->fFrameSize;
2108     fRXPat->fFrameSize += size;
2109     if (fRXPat->fFrameSize >= 0x00fffff0) {
2110         error(U_REGEX_PATTERN_TOO_BIG);
2111     }
2112     return dataIndex;
2113 }
2114 
2115 
2116 //------------------------------------------------------------------------------
2117 //
2118 //   blockTopLoc()          Find or create a location in the compiled pattern
2119 //                          at the start of the operation or block that has
2120 //                          just been compiled.  Needed when a quantifier (* or
2121 //                          whatever) appears, and we need to add an operation
2122 //                          at the start of the thing being quantified.
2123 //
2124 //                          (Parenthesized Blocks) have a slot with a NOP that
2125 //                          is reserved for this purpose.  .* or similar don't
2126 //                          and a slot needs to be added.
2127 //
2128 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2129 //                                         at the returned location.
2130 //                                 FALSE - just return the address,
2131 //                                         do not reserve a location there.
2132 //
2133 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2134 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2135     int32_t   theLoc;
2136     fixLiterals(TRUE);  // Emit code for any pending literals.
2137                         //   If last item was a string, emit separate op for the its last char.
2138     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2139     {
2140         // The item just processed is a parenthesized block.
2141         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2142         U_ASSERT(theLoc > 0);
2143         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2144     }
2145     else {
2146         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2147         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2148         // We need to make space now.
2149         theLoc = fRXPat->fCompiledPat->size()-1;
2150         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2151         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2152             // Strings take two opcode, we want the position of the first one.
2153             // We can have a string at this point if a single character case-folded to two.
2154             theLoc--;
2155         }
2156         if (reserveLoc) {
2157             int32_t  nop = buildOp(URX_NOP, 0);
2158             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2159         }
2160     }
2161     return theLoc;
2162 }
2163 
2164 
2165 
2166 //------------------------------------------------------------------------------
2167 //
2168 //    handleCloseParen      When compiling a close paren, we need to go back
2169 //                          and fix up any JMP or SAVE operations within the
2170 //                          parenthesized block that need to target the end
2171 //                          of the block.  The locations of these are kept on
2172 //                          the paretheses stack.
2173 //
2174 //                          This function is called both when encountering a
2175 //                          real ) and at the end of the pattern.
2176 //
2177 //------------------------------------------------------------------------------
handleCloseParen()2178 void  RegexCompile::handleCloseParen() {
2179     int32_t   patIdx;
2180     int32_t   patOp;
2181     if (fParenStack.size() <= 0) {
2182         error(U_REGEX_MISMATCHED_PAREN);
2183         return;
2184     }
2185 
2186     // Emit code for any pending literals.
2187     fixLiterals(FALSE);
2188 
2189     // Fixup any operations within the just-closed parenthesized group
2190     //    that need to reference the end of the (block).
2191     //    (The first one popped from the stack is an unused slot for
2192     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2193     for (;;) {
2194         patIdx = fParenStack.popi();
2195         if (patIdx < 0) {
2196             // value < 0 flags the start of the frame on the paren stack.
2197             break;
2198         }
2199         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2200         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2201         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2202         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2203         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2204         fMatchOpenParen     = patIdx;
2205     }
2206 
2207     //  At the close of any parenthesized block, restore the match mode flags  to
2208     //  the value they had at the open paren.  Saved value is
2209     //  at the top of the paren stack.
2210     fModeFlags = fParenStack.popi();
2211     U_ASSERT(fModeFlags < 0);
2212 
2213     // DO any additional fixups, depending on the specific kind of
2214     // parentesized grouping this is
2215 
2216     switch (patIdx) {
2217     case plain:
2218     case flags:
2219         // No additional fixups required.
2220         //   (Grouping-only parentheses)
2221         break;
2222     case capturing:
2223         // Capturing Parentheses.
2224         //   Insert a End Capture op into the pattern.
2225         //   The frame offset of the variables for this cg is obtained from the
2226         //       start capture op and put it into the end-capture op.
2227         {
2228             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2229             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2230 
2231             int32_t   frameVarLocation = URX_VAL(captureOp);
2232             appendOp(URX_END_CAPTURE, frameVarLocation);
2233         }
2234         break;
2235     case atomic:
2236         // Atomic Parenthesis.
2237         //   Insert a LD_SP operation to restore the state stack to the position
2238         //   it was when the atomic parens were entered.
2239         {
2240             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2241             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2242             int32_t   stoLoc = URX_VAL(stoOp);
2243             appendOp(URX_LD_SP, stoLoc);
2244         }
2245         break;
2246 
2247     case lookAhead:
2248         {
2249             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2250             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2251             int32_t dataLoc  = URX_VAL(startOp);
2252             appendOp(URX_LA_END, dataLoc);
2253         }
2254         break;
2255 
2256     case negLookAhead:
2257         {
2258             // See comment at doOpenLookAheadNeg
2259             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2260             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2261             int32_t dataLoc  = URX_VAL(startOp);
2262             appendOp(URX_LA_END, dataLoc);
2263             appendOp(URX_BACKTRACK, 0);
2264             appendOp(URX_LA_END, dataLoc);
2265 
2266             // Patch the URX_SAVE near the top of the block.
2267             // The destination of the SAVE is the final LA_END that was just added.
2268             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2269             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2270             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2271             saveOp           = buildOp(URX_STATE_SAVE, dest);
2272             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2273         }
2274         break;
2275 
2276     case lookBehind:
2277         {
2278             // See comment at doOpenLookBehind.
2279 
2280             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2281             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2282             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2283             int32_t dataLoc  = URX_VAL(startOp);
2284             appendOp(URX_LB_END, dataLoc);
2285             appendOp(URX_LA_END, dataLoc);
2286 
2287             // Determine the min and max bounds for the length of the
2288             //  string that the pattern can match.
2289             //  An unbounded upper limit is an error.
2290             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2291             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2292             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2293             if (URX_TYPE(maxML) != 0) {
2294                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2295                 break;
2296             }
2297             if (maxML == INT32_MAX) {
2298                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2299                 break;
2300             }
2301             if (minML == INT32_MAX) {
2302                 // This condition happens when no match is possible, such as with a
2303                 // [set] expression containing no elements.
2304                 // In principle, the generated code to evaluate the expression could be deleted,
2305                 // but it's probably not worth the complication.
2306                 minML = 0;
2307             }
2308             U_ASSERT(minML <= maxML);
2309 
2310             // Insert the min and max match len bounds into the URX_LB_CONT op that
2311             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2312             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2313             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2314 
2315         }
2316         break;
2317 
2318 
2319 
2320     case lookBehindN:
2321         {
2322             // See comment at doOpenLookBehindNeg.
2323 
2324             // Append the URX_LBN_END to the compiled pattern.
2325             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2326             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2327             int32_t dataLoc  = URX_VAL(startOp);
2328             appendOp(URX_LBN_END, dataLoc);
2329 
2330             // Determine the min and max bounds for the length of the
2331             //  string that the pattern can match.
2332             //  An unbounded upper limit is an error.
2333             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2334             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2335             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2336             if (URX_TYPE(maxML) != 0) {
2337                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2338                 break;
2339             }
2340             if (maxML == INT32_MAX) {
2341                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2342                 break;
2343             }
2344             if (minML == INT32_MAX) {
2345                 // This condition happens when no match is possible, such as with a
2346                 // [set] expression containing no elements.
2347                 // In principle, the generated code to evaluate the expression could be deleted,
2348                 // but it's probably not worth the complication.
2349                 minML = 0;
2350             }
2351 
2352             U_ASSERT(minML <= maxML);
2353 
2354             // Insert the min and max match len bounds into the URX_LB_CONT op that
2355             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2356             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2357             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2358 
2359             // Insert the pattern location to continue at after a successful match
2360             //  as the last operand of the URX_LBN_CONT
2361             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2362             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2363         }
2364         break;
2365 
2366 
2367 
2368     default:
2369         UPRV_UNREACHABLE;
2370     }
2371 
2372     // remember the next location in the compiled pattern.
2373     // The compilation of Quantifiers will look at this to see whether its looping
2374     //   over a parenthesized block or a single item
2375     fMatchCloseParen = fRXPat->fCompiledPat->size();
2376 }
2377 
2378 
2379 
2380 //------------------------------------------------------------------------------
2381 //
2382 //   compileSet       Compile the pattern operations for a reference to a
2383 //                    UnicodeSet.
2384 //
2385 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2386 void        RegexCompile::compileSet(UnicodeSet *theSet)
2387 {
2388     if (theSet == NULL) {
2389         return;
2390     }
2391     //  Remove any strings from the set.
2392     //  There shoudn't be any, but just in case.
2393     //     (Case Closure can add them; if we had a simple case closure avaialble that
2394     //      ignored strings, that would be better.)
2395     theSet->removeAllStrings();
2396     int32_t  setSize = theSet->size();
2397 
2398     switch (setSize) {
2399     case 0:
2400         {
2401             // Set of no elements.   Always fails to match.
2402             appendOp(URX_BACKTRACK, 0);
2403             delete theSet;
2404         }
2405         break;
2406 
2407     case 1:
2408         {
2409             // The set contains only a single code point.  Put it into
2410             //   the compiled pattern as a single char operation rather
2411             //   than a set, and discard the set itself.
2412             literalChar(theSet->charAt(0));
2413             delete theSet;
2414         }
2415         break;
2416 
2417     default:
2418         {
2419             //  The set contains two or more chars.  (the normal case)
2420             //  Put it into the compiled pattern as a set.
2421             int32_t setNumber = fRXPat->fSets->size();
2422             fRXPat->fSets->addElement(theSet, *fStatus);
2423             appendOp(URX_SETREF, setNumber);
2424         }
2425     }
2426 }
2427 
2428 
2429 //------------------------------------------------------------------------------
2430 //
2431 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2432 //                      Except for the specific opcodes used, the code is the same
2433 //                      for all three types (greedy, non-greedy, possessive) of
2434 //                      intervals.  The opcodes are supplied as parameters.
2435 //                      (There are two sets of opcodes - greedy & possessive use the
2436 //                      same ones, while non-greedy has it's own.)
2437 //
2438 //                      The code for interval loops has this form:
2439 //                         0  CTR_INIT   counter loc (in stack frame)
2440 //                         1             5  patt address of CTR_LOOP at bottom of block
2441 //                         2             min count
2442 //                         3             max count   (-1 for unbounded)
2443 //                         4  ...        block to be iterated over
2444 //                         5  CTR_LOOP
2445 //
2446 //                       In
2447 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2448 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2449 {
2450     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2451     //   four slots in the compiled code.  Reserve them.
2452     int32_t   topOfBlock = blockTopLoc(TRUE);
2453     insertOp(topOfBlock);
2454     insertOp(topOfBlock);
2455     insertOp(topOfBlock);
2456 
2457     // The operands for the CTR_INIT opcode include the index in the matcher data
2458     //   of the counter.  Allocate it now. There are two data items
2459     //        counterLoc   -->  Loop counter
2460     //               +1    -->  Input index (for breaking non-progressing loops)
2461     //                          (Only present if unbounded upper limit on loop)
2462     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2463     int32_t   counterLoc = allocateStackData(dataSize);
2464 
2465     int32_t   op = buildOp(InitOp, counterLoc);
2466     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2467 
2468     // The second operand of CTR_INIT is the location following the end of the loop.
2469     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2470     //   compilation of something later on causes the code to grow and the target
2471     //   position to move.
2472     int32_t loopEnd = fRXPat->fCompiledPat->size();
2473     op = buildOp(URX_RELOC_OPRND, loopEnd);
2474     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2475 
2476     // Followed by the min and max counts.
2477     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2478     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2479 
2480     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2481     //   Goes at end of the block being looped over, so just append to the code so far.
2482     appendOp(LoopOp, topOfBlock);
2483 
2484     if ((fIntervalLow & 0xff000000) != 0 ||
2485         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2486             error(U_REGEX_NUMBER_TOO_BIG);
2487         }
2488 
2489     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2490         error(U_REGEX_MAX_LT_MIN);
2491     }
2492 }
2493 
2494 
2495 
compileInlineInterval()2496 UBool RegexCompile::compileInlineInterval() {
2497     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2498         // Too big to inline.  Fail, which will cause looping code to be generated.
2499         //   (Upper < Lower picks up unbounded upper and errors, both.)
2500         return FALSE;
2501     }
2502 
2503     int32_t   topOfBlock = blockTopLoc(FALSE);
2504     if (fIntervalUpper == 0) {
2505         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2506         // Discard the generated code for the block.
2507         // If the block included parens, discard the info pertaining to them as well.
2508         fRXPat->fCompiledPat->setSize(topOfBlock);
2509         if (fMatchOpenParen >= topOfBlock) {
2510             fMatchOpenParen = -1;
2511         }
2512         if (fMatchCloseParen >= topOfBlock) {
2513             fMatchCloseParen = -1;
2514         }
2515         return TRUE;
2516     }
2517 
2518     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2519         // The thing being repeated is not a single op, but some
2520         //   more complex block.  Do it as a loop, not inlines.
2521         //   Note that things "repeated" a max of once are handled as inline, because
2522         //     the one copy of the code already generated is just fine.
2523         return FALSE;
2524     }
2525 
2526     // Pick up the opcode that is to be repeated
2527     //
2528     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2529 
2530     // Compute the pattern location where the inline sequence
2531     //   will end, and set up the state save op that will be needed.
2532     //
2533     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2534                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2535     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2536     if (fIntervalLow == 0) {
2537         insertOp(topOfBlock);
2538         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2539     }
2540 
2541 
2542 
2543     //  Loop, emitting the op for the thing being repeated each time.
2544     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2545     //    it was put there when it was originally encountered.
2546     int32_t i;
2547     for (i=1; i<fIntervalUpper; i++ ) {
2548         if (i >= fIntervalLow) {
2549             appendOp(saveOp);
2550         }
2551         appendOp(op);
2552     }
2553     return TRUE;
2554 }
2555 
2556 
2557 
2558 //------------------------------------------------------------------------------
2559 //
2560 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2561 //                         set of characters that could potentially begin a case-insensitive
2562 //                         match of a string beginning with that character, using full Unicode
2563 //                         case insensitive matching.
2564 //
2565 //          This is used in optimizing find().
2566 //
2567 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2568 //          misses cases like this:
2569 //             A string from the pattern begins with 'ss' (although all we know
2570 //                 in this context is that it begins with 's')
2571 //             The pattern could match a string beginning with a German sharp-s
2572 //
2573 //           To the ordinary case closure for a character c, we add all other
2574 //           characters cx where the case closure of cx incudes a string form that begins
2575 //           with the original character c.
2576 //
2577 //           This function could be made smarter. The full pattern string is available
2578 //           and it would be possible to verify that the extra characters being added
2579 //           to the starting set fully match, rather than having just a first-char of the
2580 //           folded form match.
2581 //
2582 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2583 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2584 
2585 // Machine Generated below.
2586 // It may need updating with new versions of Unicode.
2587 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2588 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2589 
2590 // Machine Generated Data. Do not hand edit.
2591     static const UChar32 RECaseFixCodePoints[] = {
2592         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2593         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2594         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2595         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2596         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2597 
2598     static const int16_t RECaseFixStringOffsets[] = {
2599         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2600         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2601         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2602         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2603         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2604 
2605     static const int16_t RECaseFixCounts[] = {
2606         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2607         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2608         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2609         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2610         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2611 
2612     static const UChar RECaseFixData[] = {
2613         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2614         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2615         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2616         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2617         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2618         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2619         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2620         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2621         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2622         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2623         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2624 
2625 // End of machine generated data.
2626 
2627     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2628         // This function should never be called with an invalid input character.
2629         UPRV_UNREACHABLE;
2630     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2631         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2632         starterChars->set(caseFoldedC, caseFoldedC);
2633 
2634         int32_t i;
2635         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2636             // Simple linear search through the sorted list of interesting code points.
2637         }
2638 
2639         if (RECaseFixCodePoints[i] == c) {
2640             int32_t dataIndex = RECaseFixStringOffsets[i];
2641             int32_t numCharsToAdd = RECaseFixCounts[i];
2642             UChar32 cpToAdd = 0;
2643             for (int32_t j=0; j<numCharsToAdd; j++) {
2644                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2645                 starterChars->add(cpToAdd);
2646             }
2647         }
2648 
2649         starterChars->closeOver(USET_CASE_INSENSITIVE);
2650         starterChars->removeAllStrings();
2651     } else {
2652         // Not a cased character. Just return it alone.
2653         starterChars->set(c, c);
2654     }
2655 }
2656 
2657 
2658 // Increment with overflow check.
2659 // val and delta will both be positive.
2660 
safeIncrement(int32_t val,int32_t delta)2661 static int32_t safeIncrement(int32_t val, int32_t delta) {
2662     if (INT32_MAX - val > delta) {
2663         return val + delta;
2664     } else {
2665         return INT32_MAX;
2666     }
2667 }
2668 
2669 
2670 //------------------------------------------------------------------------------
2671 //
2672 //   matchStartType    Determine how a match can start.
2673 //                     Used to optimize find() operations.
2674 //
2675 //                     Operation is very similar to minMatchLength().  Walk the compiled
2676 //                     pattern, keeping an on-going minimum-match-length.  For any
2677 //                     op where the min match coming in is zero, add that ops possible
2678 //                     starting matches to the possible starts for the overall pattern.
2679 //
2680 //------------------------------------------------------------------------------
matchStartType()2681 void   RegexCompile::matchStartType() {
2682     if (U_FAILURE(*fStatus)) {
2683         return;
2684     }
2685 
2686 
2687     int32_t    loc;                    // Location in the pattern of the current op being processed.
2688     int32_t    op;                     // The op being processed
2689     int32_t    opType;                 // The opcode type of the op
2690     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2691     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2692 
2693     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2694                                        //   could have advanced the position in a match.
2695                                        //   (Maximum match length so far == 0)
2696 
2697     // forwardedLength is a vector holding minimum-match-length values that
2698     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2699     //   It must be one longer than the pattern being checked because some  ops
2700     //   will jmp to a end-of-block+1 location from within a block, and we must
2701     //   count those when checking the block.
2702     int32_t end = fRXPat->fCompiledPat->size();
2703     UVector32  forwardedLength(end+1, *fStatus);
2704     forwardedLength.setSize(end+1);
2705     for (loc=3; loc<end; loc++) {
2706         forwardedLength.setElementAt(INT32_MAX, loc);
2707     }
2708 
2709     for (loc = 3; loc<end; loc++) {
2710         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2711         opType = URX_TYPE(op);
2712 
2713         // The loop is advancing linearly through the pattern.
2714         // If the op we are now at was the destination of a branch in the pattern,
2715         // and that path has a shorter minimum length than the current accumulated value,
2716         // replace the current accumulated value.
2717         if (forwardedLength.elementAti(loc) < currentLen) {
2718             currentLen = forwardedLength.elementAti(loc);
2719             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2720         }
2721 
2722         switch (opType) {
2723             // Ops that don't change the total length matched
2724         case URX_RESERVED_OP:
2725         case URX_END:
2726         case URX_FAIL:
2727         case URX_STRING_LEN:
2728         case URX_NOP:
2729         case URX_START_CAPTURE:
2730         case URX_END_CAPTURE:
2731         case URX_BACKSLASH_B:
2732         case URX_BACKSLASH_BU:
2733         case URX_BACKSLASH_G:
2734         case URX_BACKSLASH_Z:
2735         case URX_DOLLAR:
2736         case URX_DOLLAR_M:
2737         case URX_DOLLAR_D:
2738         case URX_DOLLAR_MD:
2739         case URX_RELOC_OPRND:
2740         case URX_STO_INP_LOC:
2741         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2742         case URX_BACKREF_I:
2743 
2744         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2745         case URX_LD_SP:
2746             break;
2747 
2748         case URX_CARET:
2749             if (atStart) {
2750                 fRXPat->fStartType = START_START;
2751             }
2752             break;
2753 
2754         case URX_CARET_M:
2755         case URX_CARET_M_UNIX:
2756             if (atStart) {
2757                 fRXPat->fStartType = START_LINE;
2758             }
2759             break;
2760 
2761         case URX_ONECHAR:
2762             if (currentLen == 0) {
2763                 // This character could appear at the start of a match.
2764                 //   Add it to the set of possible starting characters.
2765                 fRXPat->fInitialChars->add(URX_VAL(op));
2766                 numInitialStrings += 2;
2767             }
2768             currentLen = safeIncrement(currentLen, 1);
2769             atStart = FALSE;
2770             break;
2771 
2772 
2773         case URX_SETREF:
2774             if (currentLen == 0) {
2775                 int32_t  sn = URX_VAL(op);
2776                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2777                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2778                 fRXPat->fInitialChars->addAll(*s);
2779                 numInitialStrings += 2;
2780             }
2781             currentLen = safeIncrement(currentLen, 1);
2782             atStart = FALSE;
2783             break;
2784 
2785         case URX_LOOP_SR_I:
2786             // [Set]*, like a SETREF, above, in what it can match,
2787             //  but may not match at all, so currentLen is not incremented.
2788             if (currentLen == 0) {
2789                 int32_t  sn = URX_VAL(op);
2790                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2791                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2792                 fRXPat->fInitialChars->addAll(*s);
2793                 numInitialStrings += 2;
2794             }
2795             atStart = FALSE;
2796             break;
2797 
2798         case URX_LOOP_DOT_I:
2799             if (currentLen == 0) {
2800                 // .* at the start of a pattern.
2801                 //    Any character can begin the match.
2802                 fRXPat->fInitialChars->clear();
2803                 fRXPat->fInitialChars->complement();
2804                 numInitialStrings += 2;
2805             }
2806             atStart = FALSE;
2807             break;
2808 
2809 
2810         case URX_STATIC_SETREF:
2811             if (currentLen == 0) {
2812                 int32_t  sn = URX_VAL(op);
2813                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2814                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2815                 fRXPat->fInitialChars->addAll(*s);
2816                 numInitialStrings += 2;
2817             }
2818             currentLen = safeIncrement(currentLen, 1);
2819             atStart = FALSE;
2820             break;
2821 
2822 
2823 
2824         case URX_STAT_SETREF_N:
2825             if (currentLen == 0) {
2826                 int32_t  sn = URX_VAL(op);
2827                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2828                 UnicodeSet sc(*s);
2829                 sc.complement();
2830                 fRXPat->fInitialChars->addAll(sc);
2831                 numInitialStrings += 2;
2832             }
2833             currentLen = safeIncrement(currentLen, 1);
2834             atStart = FALSE;
2835             break;
2836 
2837 
2838 
2839         case URX_BACKSLASH_D:
2840             // Digit Char
2841              if (currentLen == 0) {
2842                  UnicodeSet s;
2843                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2844                  if (URX_VAL(op) != 0) {
2845                      s.complement();
2846                  }
2847                  fRXPat->fInitialChars->addAll(s);
2848                  numInitialStrings += 2;
2849             }
2850             currentLen = safeIncrement(currentLen, 1);
2851             atStart = FALSE;
2852             break;
2853 
2854 
2855         case URX_BACKSLASH_H:
2856             // Horiz white space
2857             if (currentLen == 0) {
2858                 UnicodeSet s;
2859                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2860                 s.add((UChar32)9);   // Tab
2861                 if (URX_VAL(op) != 0) {
2862                     s.complement();
2863                 }
2864                 fRXPat->fInitialChars->addAll(s);
2865                 numInitialStrings += 2;
2866             }
2867             currentLen = safeIncrement(currentLen, 1);
2868             atStart = FALSE;
2869             break;
2870 
2871 
2872         case URX_BACKSLASH_R:       // Any line ending sequence
2873         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2874             if (currentLen == 0) {
2875                 UnicodeSet s;
2876                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2877                 s.add((UChar32)0x85);
2878                 s.add((UChar32)0x2028, (UChar32)0x2029);
2879                 if (URX_VAL(op) != 0) {
2880                      // Complement option applies to URX_BACKSLASH_V only.
2881                      s.complement();
2882                 }
2883                 fRXPat->fInitialChars->addAll(s);
2884                 numInitialStrings += 2;
2885             }
2886             currentLen = safeIncrement(currentLen, 1);
2887             atStart = FALSE;
2888             break;
2889 
2890 
2891 
2892         case URX_ONECHAR_I:
2893             // Case Insensitive Single Character.
2894             if (currentLen == 0) {
2895                 UChar32  c = URX_VAL(op);
2896                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2897                     UnicodeSet starters(c, c);
2898                     starters.closeOver(USET_CASE_INSENSITIVE);
2899                     // findCaseInsensitiveStarters(c, &starters);
2900                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2901                     //   The expanded folding can't match the pattern.
2902                     fRXPat->fInitialChars->addAll(starters);
2903                 } else {
2904                     // Char has no case variants.  Just add it as-is to the
2905                     //   set of possible starting chars.
2906                     fRXPat->fInitialChars->add(c);
2907                 }
2908                 numInitialStrings += 2;
2909             }
2910             currentLen = safeIncrement(currentLen, 1);
2911             atStart = FALSE;
2912             break;
2913 
2914 
2915         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2916         case URX_DOTANY_ALL:    // . matches one or two.
2917         case URX_DOTANY:
2918         case URX_DOTANY_UNIX:
2919             if (currentLen == 0) {
2920                 // These constructs are all bad news when they appear at the start
2921                 //   of a match.  Any character can begin the match.
2922                 fRXPat->fInitialChars->clear();
2923                 fRXPat->fInitialChars->complement();
2924                 numInitialStrings += 2;
2925             }
2926             currentLen = safeIncrement(currentLen, 1);
2927             atStart = FALSE;
2928             break;
2929 
2930 
2931         case URX_JMPX:
2932             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2933             U_FALLTHROUGH;
2934         case URX_JMP:
2935             {
2936                 int32_t  jmpDest = URX_VAL(op);
2937                 if (jmpDest < loc) {
2938                     // Loop of some kind.  Can safely ignore, the worst that will happen
2939                     //  is that we understate the true minimum length
2940                     currentLen = forwardedLength.elementAti(loc+1);
2941 
2942                 } else {
2943                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2944                     U_ASSERT(jmpDest <= end+1);
2945                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2946                         forwardedLength.setElementAt(currentLen, jmpDest);
2947                     }
2948                 }
2949             }
2950             atStart = FALSE;
2951             break;
2952 
2953         case URX_JMP_SAV:
2954         case URX_JMP_SAV_X:
2955             // Combo of state save to the next loc, + jmp backwards.
2956             //   Net effect on min. length computation is nothing.
2957             atStart = FALSE;
2958             break;
2959 
2960         case URX_BACKTRACK:
2961             // Fails are kind of like a branch, except that the min length was
2962             //   propagated already, by the state save.
2963             currentLen = forwardedLength.elementAti(loc+1);
2964             atStart = FALSE;
2965             break;
2966 
2967 
2968         case URX_STATE_SAVE:
2969             {
2970                 // State Save, for forward jumps, propagate the current minimum.
2971                 //             of the state save.
2972                 int32_t  jmpDest = URX_VAL(op);
2973                 if (jmpDest > loc) {
2974                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2975                         forwardedLength.setElementAt(currentLen, jmpDest);
2976                     }
2977                 }
2978             }
2979             atStart = FALSE;
2980             break;
2981 
2982 
2983 
2984 
2985         case URX_STRING:
2986             {
2987                 loc++;
2988                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2989                 int32_t stringLen   = URX_VAL(stringLenOp);
2990                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2991                 U_ASSERT(stringLenOp >= 2);
2992                 if (currentLen == 0) {
2993                     // Add the starting character of this string to the set of possible starting
2994                     //   characters for this pattern.
2995                     int32_t stringStartIdx = URX_VAL(op);
2996                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2997                     fRXPat->fInitialChars->add(c);
2998 
2999                     // Remember this string.  After the entire pattern has been checked,
3000                     //  if nothing else is identified that can start a match, we'll use it.
3001                     numInitialStrings++;
3002                     fRXPat->fInitialStringIdx = stringStartIdx;
3003                     fRXPat->fInitialStringLen = stringLen;
3004                 }
3005 
3006                 currentLen = safeIncrement(currentLen, stringLen);
3007                 atStart = FALSE;
3008             }
3009             break;
3010 
3011         case URX_STRING_I:
3012             {
3013                 // Case-insensitive string.  Unlike exact-match strings, we won't
3014                 //   attempt a string search for possible match positions.  But we
3015                 //   do update the set of possible starting characters.
3016                 loc++;
3017                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3018                 int32_t stringLen   = URX_VAL(stringLenOp);
3019                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3020                 U_ASSERT(stringLenOp >= 2);
3021                 if (currentLen == 0) {
3022                     // Add the starting character of this string to the set of possible starting
3023                     //   characters for this pattern.
3024                     int32_t stringStartIdx = URX_VAL(op);
3025                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3026                     UnicodeSet s;
3027                     findCaseInsensitiveStarters(c, &s);
3028                     fRXPat->fInitialChars->addAll(s);
3029                     numInitialStrings += 2;  // Matching on an initial string not possible.
3030                 }
3031                 currentLen = safeIncrement(currentLen, stringLen);
3032                 atStart = FALSE;
3033             }
3034             break;
3035 
3036         case URX_CTR_INIT:
3037         case URX_CTR_INIT_NG:
3038             {
3039                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3040                 //   so location must be updated accordingly.
3041                 // Loop Init Ops.
3042                 //   If the min loop count == 0
3043                 //      move loc forwards to the end of the loop, skipping over the body.
3044                 //   If the min count is > 0,
3045                 //      continue normal processing of the body of the loop.
3046                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3047                         loopEndLoc   = URX_VAL(loopEndLoc);
3048                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3049                 if (minLoopCount == 0) {
3050                     // Min Loop Count of 0, treat like a forward branch and
3051                     //   move the current minimum length up to the target
3052                     //   (end of loop) location.
3053                     U_ASSERT(loopEndLoc <= end+1);
3054                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3055                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3056                     }
3057                 }
3058                 loc+=3;  // Skips over operands of CTR_INIT
3059             }
3060             atStart = FALSE;
3061             break;
3062 
3063 
3064         case URX_CTR_LOOP:
3065         case URX_CTR_LOOP_NG:
3066             // Loop ops.
3067             //  The jump is conditional, backwards only.
3068             atStart = FALSE;
3069             break;
3070 
3071         case URX_LOOP_C:
3072             // More loop ops.  These state-save to themselves.
3073             //   don't change the minimum match
3074             atStart = FALSE;
3075             break;
3076 
3077 
3078         case URX_LA_START:
3079         case URX_LB_START:
3080             {
3081                 // Look-around.  Scan forward until the matching look-ahead end,
3082                 //   without processing the look-around block.  This is overly pessimistic.
3083 
3084                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3085                 //   lookahead contains two LA_END instructions, so count goes up by two
3086                 //   for each LA_START.
3087                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3088                 for (;;) {
3089                     loc++;
3090                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3091                     if (URX_TYPE(op) == URX_LA_START) {
3092                         depth+=2;
3093                     }
3094                     if (URX_TYPE(op) == URX_LB_START) {
3095                         depth++;
3096                     }
3097                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3098                         depth--;
3099                         if (depth == 0) {
3100                             break;
3101                         }
3102                     }
3103                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3104                         // Need this because neg lookahead blocks will FAIL to outside
3105                         //   of the block.
3106                         int32_t  jmpDest = URX_VAL(op);
3107                         if (jmpDest > loc) {
3108                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3109                                 forwardedLength.setElementAt(currentLen, jmpDest);
3110                             }
3111                         }
3112                     }
3113                     U_ASSERT(loc <= end);
3114                 }
3115             }
3116             break;
3117 
3118         case URX_LA_END:
3119         case URX_LB_CONT:
3120         case URX_LB_END:
3121         case URX_LBN_CONT:
3122         case URX_LBN_END:
3123             UPRV_UNREACHABLE;     // Shouldn't get here.  These ops should be
3124                                  //  consumed by the scan in URX_LA_START and LB_START
3125         default:
3126             UPRV_UNREACHABLE;
3127             }
3128 
3129         }
3130 
3131 
3132     // We have finished walking through the ops.  Check whether some forward jump
3133     //   propagated a shorter length to location end+1.
3134     if (forwardedLength.elementAti(end+1) < currentLen) {
3135         currentLen = forwardedLength.elementAti(end+1);
3136     }
3137 
3138 
3139     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3140 
3141 
3142     // Sort out what we should check for when looking for candidate match start positions.
3143     // In order of preference,
3144     //     1.   Start of input text buffer.
3145     //     2.   A literal string.
3146     //     3.   Start of line in multi-line mode.
3147     //     4.   A single literal character.
3148     //     5.   A character from a set of characters.
3149     //
3150     if (fRXPat->fStartType == START_START) {
3151         // Match only at the start of an input text string.
3152         //    start type is already set.  We're done.
3153     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3154         // Match beginning only with a literal string.
3155         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3156         U_ASSERT(fRXPat->fInitialChars->contains(c));
3157         fRXPat->fStartType   = START_STRING;
3158         fRXPat->fInitialChar = c;
3159     } else if (fRXPat->fStartType == START_LINE) {
3160         // Match at start of line in Multi-Line mode.
3161         // Nothing to do here; everything is already set.
3162     } else if (fRXPat->fMinMatchLen == 0) {
3163         // Zero length match possible.  We could start anywhere.
3164         fRXPat->fStartType = START_NO_INFO;
3165     } else if (fRXPat->fInitialChars->size() == 1) {
3166         // All matches begin with the same char.
3167         fRXPat->fStartType   = START_CHAR;
3168         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3169         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3170     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3171         fRXPat->fMinMatchLen > 0) {
3172         // Matches start with a set of character smaller than the set of all chars.
3173         fRXPat->fStartType = START_SET;
3174     } else {
3175         // Matches can start with anything
3176         fRXPat->fStartType = START_NO_INFO;
3177     }
3178 
3179     return;
3180 }
3181 
3182 
3183 
3184 //------------------------------------------------------------------------------
3185 //
3186 //   minMatchLength    Calculate the length of the shortest string that could
3187 //                     match the specified pattern.
3188 //                     Length is in 16 bit code units, not code points.
3189 //
3190 //                     The calculated length may not be exact.  The returned
3191 //                     value may be shorter than the actual minimum; it must
3192 //                     never be longer.
3193 //
3194 //                     start and end are the range of p-code operations to be
3195 //                     examined.  The endpoints are included in the range.
3196 //
3197 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3198 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3199     if (U_FAILURE(*fStatus)) {
3200         return 0;
3201     }
3202 
3203     U_ASSERT(start <= end);
3204     U_ASSERT(end < fRXPat->fCompiledPat->size());
3205 
3206 
3207     int32_t    loc;
3208     int32_t    op;
3209     int32_t    opType;
3210     int32_t    currentLen = 0;
3211 
3212 
3213     // forwardedLength is a vector holding minimum-match-length values that
3214     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3215     //   It must be one longer than the pattern being checked because some  ops
3216     //   will jmp to a end-of-block+1 location from within a block, and we must
3217     //   count those when checking the block.
3218     UVector32  forwardedLength(end+2, *fStatus);
3219     forwardedLength.setSize(end+2);
3220     for (loc=start; loc<=end+1; loc++) {
3221         forwardedLength.setElementAt(INT32_MAX, loc);
3222     }
3223 
3224     for (loc = start; loc<=end; loc++) {
3225         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3226         opType = URX_TYPE(op);
3227 
3228         // The loop is advancing linearly through the pattern.
3229         // If the op we are now at was the destination of a branch in the pattern,
3230         // and that path has a shorter minimum length than the current accumulated value,
3231         // replace the current accumulated value.
3232         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3233                                                                //   no-match-possible cases.
3234         if (forwardedLength.elementAti(loc) < currentLen) {
3235             currentLen = forwardedLength.elementAti(loc);
3236             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3237         }
3238 
3239         switch (opType) {
3240             // Ops that don't change the total length matched
3241         case URX_RESERVED_OP:
3242         case URX_END:
3243         case URX_STRING_LEN:
3244         case URX_NOP:
3245         case URX_START_CAPTURE:
3246         case URX_END_CAPTURE:
3247         case URX_BACKSLASH_B:
3248         case URX_BACKSLASH_BU:
3249         case URX_BACKSLASH_G:
3250         case URX_BACKSLASH_Z:
3251         case URX_CARET:
3252         case URX_DOLLAR:
3253         case URX_DOLLAR_M:
3254         case URX_DOLLAR_D:
3255         case URX_DOLLAR_MD:
3256         case URX_RELOC_OPRND:
3257         case URX_STO_INP_LOC:
3258         case URX_CARET_M:
3259         case URX_CARET_M_UNIX:
3260         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3261         case URX_BACKREF_I:
3262 
3263         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3264         case URX_LD_SP:
3265 
3266         case URX_JMP_SAV:
3267         case URX_JMP_SAV_X:
3268             break;
3269 
3270 
3271             // Ops that match a minimum of one character (one or two 16 bit code units.)
3272             //
3273         case URX_ONECHAR:
3274         case URX_STATIC_SETREF:
3275         case URX_STAT_SETREF_N:
3276         case URX_SETREF:
3277         case URX_BACKSLASH_D:
3278         case URX_BACKSLASH_H:
3279         case URX_BACKSLASH_R:
3280         case URX_BACKSLASH_V:
3281         case URX_ONECHAR_I:
3282         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3283         case URX_DOTANY_ALL:    // . matches one or two.
3284         case URX_DOTANY:
3285         case URX_DOTANY_UNIX:
3286             currentLen = safeIncrement(currentLen, 1);
3287             break;
3288 
3289 
3290         case URX_JMPX:
3291             loc++;              // URX_JMPX has an extra operand, ignored here,
3292                                 //   otherwise processed identically to URX_JMP.
3293             U_FALLTHROUGH;
3294         case URX_JMP:
3295             {
3296                 int32_t  jmpDest = URX_VAL(op);
3297                 if (jmpDest < loc) {
3298                     // Loop of some kind.  Can safely ignore, the worst that will happen
3299                     //  is that we understate the true minimum length
3300                     currentLen = forwardedLength.elementAti(loc+1);
3301                 } else {
3302                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3303                     U_ASSERT(jmpDest <= end+1);
3304                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3305                         forwardedLength.setElementAt(currentLen, jmpDest);
3306                     }
3307                 }
3308             }
3309             break;
3310 
3311         case URX_BACKTRACK:
3312             {
3313                 // Back-tracks are kind of like a branch, except that the min length was
3314                 //   propagated already, by the state save.
3315                 currentLen = forwardedLength.elementAti(loc+1);
3316             }
3317             break;
3318 
3319 
3320         case URX_STATE_SAVE:
3321             {
3322                 // State Save, for forward jumps, propagate the current minimum.
3323                 //             of the state save.
3324                 int32_t  jmpDest = URX_VAL(op);
3325                 if (jmpDest > loc) {
3326                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3327                         forwardedLength.setElementAt(currentLen, jmpDest);
3328                     }
3329                 }
3330             }
3331             break;
3332 
3333 
3334         case URX_STRING:
3335             {
3336                 loc++;
3337                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3338                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3339             }
3340             break;
3341 
3342 
3343         case URX_STRING_I:
3344             {
3345                 loc++;
3346                 // TODO: with full case folding, matching input text may be shorter than
3347                 //       the string we have here.  More smarts could put some bounds on it.
3348                 //       Assume a min length of one for now.  A min length of zero causes
3349                 //        optimization failures for a pattern like "string"+
3350                 // currentLen += URX_VAL(stringLenOp);
3351                 currentLen = safeIncrement(currentLen, 1);
3352             }
3353             break;
3354 
3355         case URX_CTR_INIT:
3356         case URX_CTR_INIT_NG:
3357             {
3358                 // Loop Init Ops.
3359                 //   If the min loop count == 0
3360                 //      move loc forwards to the end of the loop, skipping over the body.
3361                 //   If the min count is > 0,
3362                 //      continue normal processing of the body of the loop.
3363                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3364                         loopEndLoc   = URX_VAL(loopEndLoc);
3365                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3366                 if (minLoopCount == 0) {
3367                     loc = loopEndLoc;
3368                 } else {
3369                     loc+=3;  // Skips over operands of CTR_INIT
3370                 }
3371             }
3372             break;
3373 
3374 
3375         case URX_CTR_LOOP:
3376         case URX_CTR_LOOP_NG:
3377             // Loop ops.
3378             //  The jump is conditional, backwards only.
3379             break;
3380 
3381         case URX_LOOP_SR_I:
3382         case URX_LOOP_DOT_I:
3383         case URX_LOOP_C:
3384             // More loop ops.  These state-save to themselves.
3385             //   don't change the minimum match - could match nothing at all.
3386             break;
3387 
3388 
3389         case URX_LA_START:
3390         case URX_LB_START:
3391             {
3392                 // Look-around.  Scan forward until the matching look-ahead end,
3393                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3394                 //   it assumes that the look-ahead match might be zero-length.
3395                 //   TODO:  Positive lookahead could recursively do the block, then continue
3396                 //          with the longer of the block or the value coming in.  Ticket 6060
3397                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3398                 for (;;) {
3399                     loc++;
3400                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3401                     if (URX_TYPE(op) == URX_LA_START) {
3402                         // The boilerplate for look-ahead includes two LA_END insturctions,
3403                         //    Depth will be decremented by each one when it is seen.
3404                         depth += 2;
3405                     }
3406                     if (URX_TYPE(op) == URX_LB_START) {
3407                         depth++;
3408                     }
3409                     if (URX_TYPE(op) == URX_LA_END) {
3410                         depth--;
3411                         if (depth == 0) {
3412                             break;
3413                         }
3414                     }
3415                     if (URX_TYPE(op)==URX_LBN_END) {
3416                         depth--;
3417                         if (depth == 0) {
3418                             break;
3419                         }
3420                     }
3421                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3422                         // Need this because neg lookahead blocks will FAIL to outside
3423                         //   of the block.
3424                         int32_t  jmpDest = URX_VAL(op);
3425                         if (jmpDest > loc) {
3426                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3427                                 forwardedLength.setElementAt(currentLen, jmpDest);
3428                             }
3429                         }
3430                     }
3431                     U_ASSERT(loc <= end);
3432                 }
3433             }
3434             break;
3435 
3436         case URX_LA_END:
3437         case URX_LB_CONT:
3438         case URX_LB_END:
3439         case URX_LBN_CONT:
3440         case URX_LBN_END:
3441             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3442             //   range being sized, which happens when measuring size of look-behind blocks.
3443             break;
3444 
3445         default:
3446             UPRV_UNREACHABLE;
3447             }
3448 
3449         }
3450 
3451     // We have finished walking through the ops.  Check whether some forward jump
3452     //   propagated a shorter length to location end+1.
3453     if (forwardedLength.elementAti(end+1) < currentLen) {
3454         currentLen = forwardedLength.elementAti(end+1);
3455         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3456     }
3457 
3458     return currentLen;
3459 }
3460 
3461 //------------------------------------------------------------------------------
3462 //
3463 //   maxMatchLength    Calculate the length of the longest string that could
3464 //                     match the specified pattern.
3465 //                     Length is in 16 bit code units, not code points.
3466 //
3467 //                     The calculated length may not be exact.  The returned
3468 //                     value may be longer than the actual maximum; it must
3469 //                     never be shorter.
3470 //
3471 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3472 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3473     if (U_FAILURE(*fStatus)) {
3474         return 0;
3475     }
3476     U_ASSERT(start <= end);
3477     U_ASSERT(end < fRXPat->fCompiledPat->size());
3478 
3479     int32_t    loc;
3480     int32_t    op;
3481     int32_t    opType;
3482     int32_t    currentLen = 0;
3483     UVector32  forwardedLength(end+1, *fStatus);
3484     forwardedLength.setSize(end+1);
3485 
3486     for (loc=start; loc<=end; loc++) {
3487         forwardedLength.setElementAt(0, loc);
3488     }
3489 
3490     for (loc = start; loc<=end; loc++) {
3491         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3492         opType = URX_TYPE(op);
3493 
3494         // The loop is advancing linearly through the pattern.
3495         // If the op we are now at was the destination of a branch in the pattern,
3496         // and that path has a longer maximum length than the current accumulated value,
3497         // replace the current accumulated value.
3498         if (forwardedLength.elementAti(loc) > currentLen) {
3499             currentLen = forwardedLength.elementAti(loc);
3500         }
3501 
3502         switch (opType) {
3503             // Ops that don't change the total length matched
3504         case URX_RESERVED_OP:
3505         case URX_END:
3506         case URX_STRING_LEN:
3507         case URX_NOP:
3508         case URX_START_CAPTURE:
3509         case URX_END_CAPTURE:
3510         case URX_BACKSLASH_B:
3511         case URX_BACKSLASH_BU:
3512         case URX_BACKSLASH_G:
3513         case URX_BACKSLASH_Z:
3514         case URX_CARET:
3515         case URX_DOLLAR:
3516         case URX_DOLLAR_M:
3517         case URX_DOLLAR_D:
3518         case URX_DOLLAR_MD:
3519         case URX_RELOC_OPRND:
3520         case URX_STO_INP_LOC:
3521         case URX_CARET_M:
3522         case URX_CARET_M_UNIX:
3523 
3524         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3525         case URX_LD_SP:
3526 
3527         case URX_LB_END:
3528         case URX_LB_CONT:
3529         case URX_LBN_CONT:
3530         case URX_LBN_END:
3531             break;
3532 
3533 
3534             // Ops that increase that cause an unbounded increase in the length
3535             //   of a matched string, or that increase it a hard to characterize way.
3536             //   Call the max length unbounded, and stop further checking.
3537         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3538         case URX_BACKREF_I:
3539         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3540             currentLen = INT32_MAX;
3541             break;
3542 
3543 
3544             // Ops that match a max of one character (possibly two 16 bit code units.)
3545             //
3546         case URX_STATIC_SETREF:
3547         case URX_STAT_SETREF_N:
3548         case URX_SETREF:
3549         case URX_BACKSLASH_D:
3550         case URX_BACKSLASH_H:
3551         case URX_BACKSLASH_R:
3552         case URX_BACKSLASH_V:
3553         case URX_ONECHAR_I:
3554         case URX_DOTANY_ALL:
3555         case URX_DOTANY:
3556         case URX_DOTANY_UNIX:
3557             currentLen = safeIncrement(currentLen, 2);
3558             break;
3559 
3560             // Single literal character.  Increase current max length by one or two,
3561             //       depending on whether the char is in the supplementary range.
3562         case URX_ONECHAR:
3563             currentLen = safeIncrement(currentLen, 1);
3564             if (URX_VAL(op) > 0x10000) {
3565                 currentLen = safeIncrement(currentLen, 1);
3566             }
3567             break;
3568 
3569             // Jumps.
3570             //
3571         case URX_JMP:
3572         case URX_JMPX:
3573         case URX_JMP_SAV:
3574         case URX_JMP_SAV_X:
3575             {
3576                 int32_t  jmpDest = URX_VAL(op);
3577                 if (jmpDest < loc) {
3578                     // Loop of some kind.  Max match length is unbounded.
3579                     currentLen = INT32_MAX;
3580                 } else {
3581                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3582                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3583                         forwardedLength.setElementAt(currentLen, jmpDest);
3584                     }
3585                     currentLen = 0;
3586                 }
3587             }
3588             break;
3589 
3590         case URX_BACKTRACK:
3591             // back-tracks are kind of like a branch, except that the max length was
3592             //   propagated already, by the state save.
3593             currentLen = forwardedLength.elementAti(loc+1);
3594             break;
3595 
3596 
3597         case URX_STATE_SAVE:
3598             {
3599                 // State Save, for forward jumps, propagate the current minimum.
3600                 //               of the state save.
3601                 //             For backwards jumps, they create a loop, maximum
3602                 //               match length is unbounded.
3603                 int32_t  jmpDest = URX_VAL(op);
3604                 if (jmpDest > loc) {
3605                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3606                         forwardedLength.setElementAt(currentLen, jmpDest);
3607                     }
3608                 } else {
3609                     currentLen = INT32_MAX;
3610                 }
3611             }
3612             break;
3613 
3614 
3615 
3616 
3617         case URX_STRING:
3618             {
3619                 loc++;
3620                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3621                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3622                 break;
3623             }
3624 
3625         case URX_STRING_I:
3626             // TODO:  This code assumes that any user string that matches will be no longer
3627             //        than our compiled string, with case insensitive matching.
3628             //        Our compiled string has been case-folded already.
3629             //
3630             //        Any matching user string will have no more code points than our
3631             //        compiled (folded) string.  Folding may add code points, but
3632             //        not remove them.
3633             //
3634             //        There is a potential problem if a supplemental code point
3635             //        case-folds to a BMP code point.  In this case our compiled string
3636             //        could be shorter (in code units) than a matching user string.
3637             //
3638             //        At this time (Unicode 6.1) there are no such characters, and this case
3639             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3640             //        any problematic characters are added to Unicode.
3641             //
3642             //        If this happens, we can make a set of the BMP chars that the
3643             //        troublesome supplementals fold to, scan our string, and bump the
3644             //        currentLen one extra for each that is found.
3645             //
3646             {
3647                 loc++;
3648                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3649                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3650             }
3651             break;
3652 
3653         case URX_CTR_INIT:
3654         case URX_CTR_INIT_NG:
3655             // For Loops, recursively call this function on the pattern for the loop body,
3656             //   then multiply the result by the maximum loop count.
3657             {
3658                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3659                 if (loopEndLoc == loc+4) {
3660                     // Loop has an empty body. No affect on max match length.
3661                     // Continue processing with code after the loop end.
3662                     loc = loopEndLoc;
3663                     break;
3664                 }
3665 
3666                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3667                 if (maxLoopCount == -1) {
3668                     // Unbounded Loop. No upper bound on match length.
3669                     currentLen = INT32_MAX;
3670                     break;
3671                 }
3672 
3673                 U_ASSERT(loopEndLoc >= loc+4);
3674                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3675                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3676                 if (updatedLen >= INT32_MAX) {
3677                     currentLen = INT32_MAX;
3678                     break;
3679                 }
3680                 currentLen = (int32_t)updatedLen;
3681                 loc = loopEndLoc;
3682                 break;
3683             }
3684 
3685         case URX_CTR_LOOP:
3686         case URX_CTR_LOOP_NG:
3687             // These opcodes will be skipped over by code for URX_CTR_INIT.
3688             // We shouldn't encounter them here.
3689             UPRV_UNREACHABLE;
3690 
3691         case URX_LOOP_SR_I:
3692         case URX_LOOP_DOT_I:
3693         case URX_LOOP_C:
3694             // For anything to do with loops, make the match length unbounded.
3695             currentLen = INT32_MAX;
3696             break;
3697 
3698 
3699 
3700         case URX_LA_START:
3701         case URX_LA_END:
3702             // Look-ahead.  Just ignore, treat the look-ahead block as if
3703             // it were normal pattern.  Gives a too-long match length,
3704             //  but good enough for now.
3705             break;
3706 
3707             // End of look-ahead ops should always be consumed by the processing at
3708             //  the URX_LA_START op.
3709             // UPRV_UNREACHABLE;
3710 
3711         case URX_LB_START:
3712             {
3713                 // Look-behind.  Scan forward until the matching look-around end,
3714                 //   without processing the look-behind block.
3715                 int32_t dataLoc = URX_VAL(op);
3716                 for (loc = loc + 1; loc < end; ++loc) {
3717                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3718                     int32_t opType = URX_TYPE(op);
3719                     if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3720                         break;
3721                     }
3722                 }
3723                 U_ASSERT(loc < end);
3724             }
3725             break;
3726 
3727         default:
3728             UPRV_UNREACHABLE;
3729         }
3730 
3731 
3732         if (currentLen == INT32_MAX) {
3733             //  The maximum length is unbounded.
3734             //  Stop further processing of the pattern.
3735             break;
3736         }
3737 
3738     }
3739     return currentLen;
3740 
3741 }
3742 
3743 
3744 //------------------------------------------------------------------------------
3745 //
3746 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3747 //                Extra NOPs are inserted for some constructs during the initial
3748 //                code generation to provide locations that may be patched later.
3749 //                Many end up unneeded, and are removed by this function.
3750 //
3751 //                In order to minimize the number of passes through the pattern,
3752 //                back-reference fixup is also performed here (adjusting
3753 //                back-reference operands to point to the correct frame offsets).
3754 //
3755 //------------------------------------------------------------------------------
stripNOPs()3756 void RegexCompile::stripNOPs() {
3757 
3758     if (U_FAILURE(*fStatus)) {
3759         return;
3760     }
3761 
3762     int32_t    end = fRXPat->fCompiledPat->size();
3763     UVector32  deltas(end, *fStatus);
3764 
3765     // Make a first pass over the code, computing the amount that things
3766     //   will be offset at each location in the original code.
3767     int32_t   loc;
3768     int32_t   d = 0;
3769     for (loc=0; loc<end; loc++) {
3770         deltas.addElement(d, *fStatus);
3771         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3772         if (URX_TYPE(op) == URX_NOP) {
3773             d++;
3774         }
3775     }
3776 
3777     UnicodeString caseStringBuffer;
3778 
3779     // Make a second pass over the code, removing the NOPs by moving following
3780     //  code up, and patching operands that refer to code locations that
3781     //  are being moved.  The array of offsets from the first step is used
3782     //  to compute the new operand values.
3783     int32_t src;
3784     int32_t dst = 0;
3785     for (src=0; src<end; src++) {
3786         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3787         int32_t opType = URX_TYPE(op);
3788         switch (opType) {
3789         case URX_NOP:
3790             break;
3791 
3792         case URX_STATE_SAVE:
3793         case URX_JMP:
3794         case URX_CTR_LOOP:
3795         case URX_CTR_LOOP_NG:
3796         case URX_RELOC_OPRND:
3797         case URX_JMPX:
3798         case URX_JMP_SAV:
3799         case URX_JMP_SAV_X:
3800             // These are instructions with operands that refer to code locations.
3801             {
3802                 int32_t  operandAddress = URX_VAL(op);
3803                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3804                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3805                 op = buildOp(opType, fixedOperandAddress);
3806                 fRXPat->fCompiledPat->setElementAt(op, dst);
3807                 dst++;
3808                 break;
3809             }
3810 
3811         case URX_BACKREF:
3812         case URX_BACKREF_I:
3813             {
3814                 int32_t where = URX_VAL(op);
3815                 if (where > fRXPat->fGroupMap->size()) {
3816                     error(U_REGEX_INVALID_BACK_REF);
3817                     break;
3818                 }
3819                 where = fRXPat->fGroupMap->elementAti(where-1);
3820                 op    = buildOp(opType, where);
3821                 fRXPat->fCompiledPat->setElementAt(op, dst);
3822                 dst++;
3823 
3824                 fRXPat->fNeedsAltInput = TRUE;
3825                 break;
3826             }
3827         case URX_RESERVED_OP:
3828         case URX_RESERVED_OP_N:
3829         case URX_BACKTRACK:
3830         case URX_END:
3831         case URX_ONECHAR:
3832         case URX_STRING:
3833         case URX_STRING_LEN:
3834         case URX_START_CAPTURE:
3835         case URX_END_CAPTURE:
3836         case URX_STATIC_SETREF:
3837         case URX_STAT_SETREF_N:
3838         case URX_SETREF:
3839         case URX_DOTANY:
3840         case URX_FAIL:
3841         case URX_BACKSLASH_B:
3842         case URX_BACKSLASH_BU:
3843         case URX_BACKSLASH_G:
3844         case URX_BACKSLASH_X:
3845         case URX_BACKSLASH_Z:
3846         case URX_DOTANY_ALL:
3847         case URX_BACKSLASH_D:
3848         case URX_CARET:
3849         case URX_DOLLAR:
3850         case URX_CTR_INIT:
3851         case URX_CTR_INIT_NG:
3852         case URX_DOTANY_UNIX:
3853         case URX_STO_SP:
3854         case URX_LD_SP:
3855         case URX_STO_INP_LOC:
3856         case URX_LA_START:
3857         case URX_LA_END:
3858         case URX_ONECHAR_I:
3859         case URX_STRING_I:
3860         case URX_DOLLAR_M:
3861         case URX_CARET_M:
3862         case URX_CARET_M_UNIX:
3863         case URX_LB_START:
3864         case URX_LB_CONT:
3865         case URX_LB_END:
3866         case URX_LBN_CONT:
3867         case URX_LBN_END:
3868         case URX_LOOP_SR_I:
3869         case URX_LOOP_DOT_I:
3870         case URX_LOOP_C:
3871         case URX_DOLLAR_D:
3872         case URX_DOLLAR_MD:
3873         case URX_BACKSLASH_H:
3874         case URX_BACKSLASH_R:
3875         case URX_BACKSLASH_V:
3876             // These instructions are unaltered by the relocation.
3877             fRXPat->fCompiledPat->setElementAt(op, dst);
3878             dst++;
3879             break;
3880 
3881         default:
3882             // Some op is unaccounted for.
3883             UPRV_UNREACHABLE;
3884         }
3885     }
3886 
3887     fRXPat->fCompiledPat->setSize(dst);
3888 }
3889 
3890 
3891 
3892 
3893 //------------------------------------------------------------------------------
3894 //
3895 //  Error         Report a rule parse error.
3896 //                Only report it if no previous error has been recorded.
3897 //
3898 //------------------------------------------------------------------------------
error(UErrorCode e)3899 void RegexCompile::error(UErrorCode e) {
3900     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3901         *fStatus = e;
3902         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3903         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3904         // int64_t. If the values of the latter are out of range for the former,
3905         // set them to the appropriate "field not supported" values.
3906         if (fLineNum > 0x7FFFFFFF) {
3907             fParseErr->line   = 0;
3908             fParseErr->offset = -1;
3909         } else if (fCharNum > 0x7FFFFFFF) {
3910             fParseErr->line   = (int32_t)fLineNum;
3911             fParseErr->offset = -1;
3912         } else {
3913             fParseErr->line   = (int32_t)fLineNum;
3914             fParseErr->offset = (int32_t)fCharNum;
3915         }
3916 
3917         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3918 
3919         // Fill in the context.
3920         //   Note: extractBetween() pins supplied indicies to the string bounds.
3921         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3922         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3923         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3924         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3925     }
3926 }
3927 
3928 
3929 //
3930 //  Assorted Unicode character constants.
3931 //     Numeric because there is no portable way to enter them as literals.
3932 //     (Think EBCDIC).
3933 //
3934 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3935 static const UChar      chLF        = 0x0a;      // Line Feed
3936 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3937 static const UChar      chDigit0    = 0x30;      // '0'
3938 static const UChar      chDigit7    = 0x37;      // '9'
3939 static const UChar      chColon     = 0x3A;      // ':'
3940 static const UChar      chE         = 0x45;      // 'E'
3941 static const UChar      chQ         = 0x51;      // 'Q'
3942 //static const UChar      chN         = 0x4E;      // 'N'
3943 static const UChar      chP         = 0x50;      // 'P'
3944 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3945 //static const UChar      chLBracket  = 0x5b;      // '['
3946 static const UChar      chRBracket  = 0x5d;      // ']'
3947 static const UChar      chUp        = 0x5e;      // '^'
3948 static const UChar      chLowerP    = 0x70;
3949 static const UChar      chLBrace    = 0x7b;      // '{'
3950 static const UChar      chRBrace    = 0x7d;      // '}'
3951 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3952 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3953 
3954 
3955 //------------------------------------------------------------------------------
3956 //
3957 //  nextCharLL    Low Level Next Char from the regex pattern.
3958 //                Get a char from the string, keep track of input position
3959 //                     for error reporting.
3960 //
3961 //------------------------------------------------------------------------------
nextCharLL()3962 UChar32  RegexCompile::nextCharLL() {
3963     UChar32       ch;
3964 
3965     if (fPeekChar != -1) {
3966         ch = fPeekChar;
3967         fPeekChar = -1;
3968         return ch;
3969     }
3970 
3971     // assume we're already in the right place
3972     ch = UTEXT_NEXT32(fRXPat->fPattern);
3973     if (ch == U_SENTINEL) {
3974         return ch;
3975     }
3976 
3977     if (ch == chCR ||
3978         ch == chNEL ||
3979         ch == chLS   ||
3980         (ch == chLF && fLastChar != chCR)) {
3981         // Character is starting a new line.  Bump up the line number, and
3982         //  reset the column to 0.
3983         fLineNum++;
3984         fCharNum=0;
3985     }
3986     else {
3987         // Character is not starting a new line.  Except in the case of a
3988         //   LF following a CR, increment the column position.
3989         if (ch != chLF) {
3990             fCharNum++;
3991         }
3992     }
3993     fLastChar = ch;
3994     return ch;
3995 }
3996 
3997 //------------------------------------------------------------------------------
3998 //
3999 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4000 //                 character without actually getting it.
4001 //
4002 //------------------------------------------------------------------------------
peekCharLL()4003 UChar32  RegexCompile::peekCharLL() {
4004     if (fPeekChar == -1) {
4005         fPeekChar = nextCharLL();
4006     }
4007     return fPeekChar;
4008 }
4009 
4010 
4011 //------------------------------------------------------------------------------
4012 //
4013 //   nextChar     for pattern scanning.  At this level, we handle stripping
4014 //                out comments and processing some backslash character escapes.
4015 //                The rest of the pattern grammar is handled at the next level up.
4016 //
4017 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4018 void RegexCompile::nextChar(RegexPatternChar &c) {
4019   tailRecursion:
4020     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4021     c.fChar    = nextCharLL();
4022     c.fQuoted  = FALSE;
4023 
4024     if (fQuoteMode) {
4025         c.fQuoted = TRUE;
4026         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4027             c.fChar == (UChar32)-1) {
4028             fQuoteMode = FALSE;  //  Exit quote mode,
4029             nextCharLL();        // discard the E
4030             // nextChar(c);      // recurse to get the real next char
4031             goto tailRecursion;  // Note: fuzz testing produced testcases that
4032                                  //       resulted in stack overflow here.
4033         }
4034     }
4035     else if (fInBackslashQuote) {
4036         // The current character immediately follows a '\'
4037         // Don't check for any further escapes, just return it as-is.
4038         // Don't set c.fQuoted, because that would prevent the state machine from
4039         //    dispatching on the character.
4040         fInBackslashQuote = FALSE;
4041     }
4042     else
4043     {
4044         // We are not in a \Q quoted region \E of the source.
4045         //
4046         if (fModeFlags & UREGEX_COMMENTS) {
4047             //
4048             // We are in free-spacing and comments mode.
4049             //  Scan through any white space and comments, until we
4050             //  reach a significant character or the end of inut.
4051             for (;;) {
4052                 if (c.fChar == (UChar32)-1) {
4053                     break;     // End of Input
4054                 }
4055                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4056                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4057                     for (;;) {
4058                         c.fChar = nextCharLL();
4059                         if (c.fChar == (UChar32)-1 ||  // EOF
4060                             c.fChar == chCR        ||
4061                             c.fChar == chLF        ||
4062                             c.fChar == chNEL       ||
4063                             c.fChar == chLS)       {
4064                             break;
4065                         }
4066                     }
4067                 }
4068                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4069                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4070                     break;
4071                 }
4072                 c.fChar = nextCharLL();
4073             }
4074         }
4075 
4076         //
4077         //  check for backslash escaped characters.
4078         //
4079         if (c.fChar == chBackSlash) {
4080             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4081             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4082                 //
4083                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4084                 //   Includes \uxxxx, \n, \r, many others.
4085                 //   Return the single equivalent character.
4086                 //
4087                 nextCharLL();                 // get & discard the peeked char.
4088                 c.fQuoted = TRUE;
4089 
4090                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4091                     int32_t endIndex = (int32_t)pos;
4092                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4093 
4094                     if (endIndex == pos) {
4095                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4096                     }
4097                     fCharNum += endIndex - pos;
4098                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4099                 } else {
4100                     int32_t offset = 0;
4101                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4102 
4103                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4104                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4105 
4106                     if (offset == 0) {
4107                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4108                     } else if (context.lastOffset == offset) {
4109                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4110                     } else if (context.lastOffset != offset-1) {
4111                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4112                     }
4113                     fCharNum += offset;
4114                 }
4115             }
4116             else if (peekCharLL() == chDigit0) {
4117                 //  Octal Escape, using Java Regexp Conventions
4118                 //    which are \0 followed by 1-3 octal digits.
4119                 //    Different from ICU Unescape handling of Octal, which does not
4120                 //    require the leading 0.
4121                 //  Java also has the convention of only consuming 2 octal digits if
4122                 //    the three digit number would be > 0xff
4123                 //
4124                 c.fChar = 0;
4125                 nextCharLL();    // Consume the initial 0.
4126                 int index;
4127                 for (index=0; index<3; index++) {
4128                     int32_t ch = peekCharLL();
4129                     if (ch<chDigit0 || ch>chDigit7) {
4130                         if (index==0) {
4131                            // \0 is not followed by any octal digits.
4132                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4133                         }
4134                         break;
4135                     }
4136                     c.fChar <<= 3;
4137                     c.fChar += ch&7;
4138                     if (c.fChar <= 255) {
4139                         nextCharLL();
4140                     } else {
4141                         // The last digit made the number too big.  Forget we saw it.
4142                         c.fChar >>= 3;
4143                     }
4144                 }
4145                 c.fQuoted = TRUE;
4146             }
4147             else if (peekCharLL() == chQ) {
4148                 //  "\Q"  enter quote mode, which will continue until "\E"
4149                 fQuoteMode = TRUE;
4150                 nextCharLL();        // discard the 'Q'.
4151                 // nextChar(c);      // recurse to get the real next char.
4152                 goto tailRecursion;  // Note: fuzz testing produced test cases that
4153                 //                            resulted in stack overflow here.
4154             }
4155             else
4156             {
4157                 // We are in a '\' escape that will be handled by the state table scanner.
4158                 // Just return the backslash, but remember that the following char is to
4159                 //  be taken literally.
4160                 fInBackslashQuote = TRUE;
4161             }
4162         }
4163     }
4164 
4165     // re-enable # to end-of-line comments, in case they were disabled.
4166     // They are disabled by the parser upon seeing '(?', but this lasts for
4167     //  the fetching of the next character only.
4168     fEOLComments = TRUE;
4169 
4170     // putc(c.fChar, stdout);
4171 }
4172 
4173 
4174 
4175 //------------------------------------------------------------------------------
4176 //
4177 //  scanNamedChar
4178 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4179 //
4180 //             The scan position will be at the 'N'.  On return
4181 //             the scan position should be just after the '}'
4182 //
4183 //             Return the UChar32
4184 //
4185 //------------------------------------------------------------------------------
scanNamedChar()4186 UChar32  RegexCompile::scanNamedChar() {
4187     if (U_FAILURE(*fStatus)) {
4188         return 0;
4189     }
4190 
4191     nextChar(fC);
4192     if (fC.fChar != chLBrace) {
4193         error(U_REGEX_PROPERTY_SYNTAX);
4194         return 0;
4195     }
4196 
4197     UnicodeString  charName;
4198     for (;;) {
4199         nextChar(fC);
4200         if (fC.fChar == chRBrace) {
4201             break;
4202         }
4203         if (fC.fChar == -1) {
4204             error(U_REGEX_PROPERTY_SYNTAX);
4205             return 0;
4206         }
4207         charName.append(fC.fChar);
4208     }
4209 
4210     char name[100];
4211     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4212          (uint32_t)charName.length()>=sizeof(name)) {
4213         // All Unicode character names have only invariant characters.
4214         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4215         //   which requires this error check
4216         error(U_REGEX_PROPERTY_SYNTAX);
4217         return 0;
4218     }
4219     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4220 
4221     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4222     if (U_FAILURE(*fStatus)) {
4223         error(U_REGEX_PROPERTY_SYNTAX);
4224     }
4225 
4226     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4227     return theChar;
4228 }
4229 
4230 //------------------------------------------------------------------------------
4231 //
4232 //  scanProp   Construct a UnicodeSet from the text at the current scan
4233 //             position, which will be of the form \p{whaterver}
4234 //
4235 //             The scan position will be at the 'p' or 'P'.  On return
4236 //             the scan position should be just after the '}'
4237 //
4238 //             Return a UnicodeSet, constructed from the \P pattern,
4239 //             or NULL if the pattern is invalid.
4240 //
4241 //------------------------------------------------------------------------------
scanProp()4242 UnicodeSet *RegexCompile::scanProp() {
4243     UnicodeSet    *uset = NULL;
4244 
4245     if (U_FAILURE(*fStatus)) {
4246         return NULL;
4247     }
4248     (void)chLowerP;   // Suppress compiler unused variable warning.
4249     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4250     UBool negated = (fC.fChar == chP);
4251 
4252     UnicodeString propertyName;
4253     nextChar(fC);
4254     if (fC.fChar != chLBrace) {
4255         error(U_REGEX_PROPERTY_SYNTAX);
4256         return NULL;
4257     }
4258     for (;;) {
4259         nextChar(fC);
4260         if (fC.fChar == chRBrace) {
4261             break;
4262         }
4263         if (fC.fChar == -1) {
4264             // Hit the end of the input string without finding the closing '}'
4265             error(U_REGEX_PROPERTY_SYNTAX);
4266             return NULL;
4267         }
4268         propertyName.append(fC.fChar);
4269     }
4270     uset = createSetForProperty(propertyName, negated);
4271     nextChar(fC);    // Move input scan to position following the closing '}'
4272     return uset;
4273 }
4274 
4275 //------------------------------------------------------------------------------
4276 //
4277 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4278 //             position, which is expected be of the form [:property expression:]
4279 //
4280 //             The scan position will be at the opening ':'.  On return
4281 //             the scan position must be on the closing ']'
4282 //
4283 //             Return a UnicodeSet constructed from the pattern,
4284 //             or NULL if this is not a valid POSIX-style set expression.
4285 //             If not a property expression, restore the initial scan position
4286 //                (to the opening ':')
4287 //
4288 //               Note:  the opening '[:' is not sufficient to guarantee that
4289 //                      this is a [:property:] expression.
4290 //                      [:'+=,] is a perfectly good ordinary set expression that
4291 //                              happens to include ':' as one of its characters.
4292 //
4293 //------------------------------------------------------------------------------
scanPosixProp()4294 UnicodeSet *RegexCompile::scanPosixProp() {
4295     UnicodeSet    *uset = NULL;
4296 
4297     if (U_FAILURE(*fStatus)) {
4298         return NULL;
4299     }
4300 
4301     U_ASSERT(fC.fChar == chColon);
4302 
4303     // Save the scanner state.
4304     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4305     int64_t     savedScanIndex        = fScanIndex;
4306     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4307     UBool       savedQuoteMode        = fQuoteMode;
4308     UBool       savedInBackslashQuote = fInBackslashQuote;
4309     UBool       savedEOLComments      = fEOLComments;
4310     int64_t     savedLineNum          = fLineNum;
4311     int64_t     savedCharNum          = fCharNum;
4312     UChar32     savedLastChar         = fLastChar;
4313     UChar32     savedPeekChar         = fPeekChar;
4314     RegexPatternChar savedfC          = fC;
4315 
4316     // Scan for a closing ].   A little tricky because there are some perverse
4317     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4318     //   ending on the second closing ].
4319 
4320     UnicodeString propName;
4321     UBool         negated  = FALSE;
4322 
4323     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4324     nextChar(fC);
4325     if (fC.fChar == chUp) {
4326        negated = TRUE;
4327        nextChar(fC);
4328     }
4329 
4330     // Scan for the closing ":]", collecting the property name along the way.
4331     UBool  sawPropSetTerminator = FALSE;
4332     for (;;) {
4333         propName.append(fC.fChar);
4334         nextChar(fC);
4335         if (fC.fQuoted || fC.fChar == -1) {
4336             // Escaped characters or end of input - either says this isn't a [:Property:]
4337             break;
4338         }
4339         if (fC.fChar == chColon) {
4340             nextChar(fC);
4341             if (fC.fChar == chRBracket) {
4342                 sawPropSetTerminator = TRUE;
4343             }
4344             break;
4345         }
4346     }
4347 
4348     if (sawPropSetTerminator) {
4349         uset = createSetForProperty(propName, negated);
4350     }
4351     else
4352     {
4353         // No closing ":]".
4354         //  Restore the original scan position.
4355         //  The main scanner will retry the input as a normal set expression,
4356         //    not a [:Property:] expression.
4357         fScanIndex        = savedScanIndex;
4358         fQuoteMode        = savedQuoteMode;
4359         fInBackslashQuote = savedInBackslashQuote;
4360         fEOLComments      = savedEOLComments;
4361         fLineNum          = savedLineNum;
4362         fCharNum          = savedCharNum;
4363         fLastChar         = savedLastChar;
4364         fPeekChar         = savedPeekChar;
4365         fC                = savedfC;
4366         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4367     }
4368     return uset;
4369 }
4370 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4371 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4372     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4373     addCategory(set, U_GC_CF_MASK, ec);
4374 }
4375 
4376 //
4377 //  Create a Unicode Set from a Unicode Property expression.
4378 //     This is common code underlying both \p{...} ane [:...:] expressions.
4379 //     Includes trying the Java "properties" that aren't supported as
4380 //     normal ICU UnicodeSet properties
4381 //
createSetForProperty(const UnicodeString & propName,UBool negated)4382 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4383 
4384     if (U_FAILURE(*fStatus)) {
4385         return nullptr;
4386     }
4387     LocalPointer<UnicodeSet> set;
4388     UErrorCode status = U_ZERO_ERROR;
4389 
4390     do {      // non-loop, exists to allow breaks from the block.
4391         //
4392         //  First try the property as we received it
4393         //
4394         UnicodeString   setExpr;
4395         uint32_t        usetFlags = 0;
4396         setExpr.append(u"[\\p{", -1);
4397         setExpr.append(propName);
4398         setExpr.append(u"}]", -1);
4399         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4400             usetFlags |= USET_CASE_INSENSITIVE;
4401         }
4402         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4403         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4404             break;
4405         }
4406 
4407         //
4408         //  The incoming property wasn't directly recognized by ICU.
4409 
4410         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4411         //     Java accepts 'word' with mixed case.
4412         //     Java accepts 'all' only in all lower case.
4413 
4414         status = U_ZERO_ERROR;
4415         if (propName.caseCompare(u"word", -1, 0) == 0) {
4416             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4417             break;
4418         }
4419         if (propName.compare(u"all", -1) == 0) {
4420             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4421             break;
4422         }
4423 
4424 
4425         //    Do Java InBlock expressions
4426         //
4427         UnicodeString mPropName = propName;
4428         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4429             status = U_ZERO_ERROR;
4430             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4431             if (U_FAILURE(status)) {
4432                 break;
4433             }
4434             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4435             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4436             break;
4437         }
4438 
4439         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4440         //  as "BooleanPropertyValue". The property value can be either a
4441         //  a General Category or a Script Name.
4442 
4443         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4444             mPropName.remove(0, 2);      // Strip the "Is"
4445             if (mPropName.indexOf(u'=') >= 0) {
4446                 // Reject any "Is..." property expression containing an '=', that is,
4447                 // any non-binary property expression.
4448                 status = U_REGEX_PROPERTY_SYNTAX;
4449                 break;
4450             }
4451 
4452             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4453                 mPropName.setTo(u"unassigned", -1);
4454                 negated = !negated;
4455             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4456                 mPropName.setTo(u"Titlecase_Letter", -1);
4457             }
4458 
4459             mPropName.insert(0, u"[\\p{", -1);
4460             mPropName.append(u"}]", -1);
4461             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4462 
4463             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4464                 set->closeOver(USET_CASE_INSENSITIVE);
4465             }
4466             break;
4467 
4468         }
4469 
4470         if (propName.startsWith(u"java", -1)) {
4471             status = U_ZERO_ERROR;
4472             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4473             if (U_FAILURE(status)) {
4474                 break;
4475             }
4476             //
4477             //  Try the various Java specific properties.
4478             //   These all begin with "java"
4479             //
4480             if (propName.compare(u"javaDefined", -1) == 0) {
4481                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4482                 set->complement();
4483             }
4484             else if (propName.compare(u"javaDigit", -1) == 0) {
4485                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4486             }
4487             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4488                 addIdentifierIgnorable(set.getAlias(), status);
4489             }
4490             else if (propName.compare(u"javaISOControl", -1) == 0) {
4491                 set->add(0, 0x1F).add(0x7F, 0x9F);
4492             }
4493             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4494                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4495                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4496                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4497                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4498                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4499                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4500                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4501                 addIdentifierIgnorable(set.getAlias(), status);
4502             }
4503             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4504                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4505                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4506                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4507                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4508             }
4509             else if (propName.compare(u"javaLetter", -1) == 0) {
4510                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4511             }
4512             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4513                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4514                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4515             }
4516             else if (propName.compare(u"javaLowerCase", -1) == 0) {
4517                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4518             }
4519             else if (propName.compare(u"javaMirrored", -1) == 0) {
4520                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4521             }
4522             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4523                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4524             }
4525             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4526                 set->add(0x10000, UnicodeSet::MAX_VALUE);
4527             }
4528             else if (propName.compare(u"javaTitleCase", -1) == 0) {
4529                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4530             }
4531             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4532                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4533                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4534             }
4535             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4536                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4537                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4538                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4539                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4540                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4541                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4542                 addIdentifierIgnorable(set.getAlias(), status);
4543             }
4544             else if (propName.compare(u"javaUpperCase", -1) == 0) {
4545                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4546             }
4547             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4548                 set->add(0, UnicodeSet::MAX_VALUE);
4549             }
4550             else if (propName.compare(u"javaWhitespace", -1) == 0) {
4551                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4552                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4553                 set->add(9, 0x0d).add(0x1c, 0x1f);
4554             } else {
4555                 status = U_REGEX_PROPERTY_SYNTAX;
4556             }
4557 
4558             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4559                 set->closeOver(USET_CASE_INSENSITIVE);
4560             }
4561             break;
4562         }
4563 
4564         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4565         // extensions matched it.
4566         status = U_REGEX_PROPERTY_SYNTAX;
4567     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4568 
4569     if (U_SUCCESS(status)) {
4570         U_ASSERT(set.isValid());
4571         if (negated) {
4572             set->complement();
4573         }
4574         return set.orphan();
4575     } else {
4576         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4577             status = U_REGEX_PROPERTY_SYNTAX;
4578         }
4579         error(status);
4580         return nullptr;
4581     }
4582 }
4583 
4584 
4585 //
4586 //  SetEval   Part of the evaluation of [set expressions].
4587 //            Perform any pending (stacked) operations with precedence
4588 //            equal or greater to that of the next operator encountered
4589 //            in the expression.
4590 //
setEval(int32_t nextOp)4591 void RegexCompile::setEval(int32_t nextOp) {
4592     UnicodeSet *rightOperand = NULL;
4593     UnicodeSet *leftOperand  = NULL;
4594     for (;;) {
4595         U_ASSERT(fSetOpStack.empty()==FALSE);
4596         int32_t pendingSetOperation = fSetOpStack.peeki();
4597         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4598             break;
4599         }
4600         fSetOpStack.popi();
4601         U_ASSERT(fSetStack.empty() == FALSE);
4602         rightOperand = (UnicodeSet *)fSetStack.peek();
4603         switch (pendingSetOperation) {
4604             case setNegation:
4605                 rightOperand->complement();
4606                 break;
4607             case setCaseClose:
4608                 // TODO: need a simple close function.  Ticket 6065
4609                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4610                 rightOperand->removeAllStrings();
4611                 break;
4612             case setDifference1:
4613             case setDifference2:
4614                 fSetStack.pop();
4615                 leftOperand = (UnicodeSet *)fSetStack.peek();
4616                 leftOperand->removeAll(*rightOperand);
4617                 delete rightOperand;
4618                 break;
4619             case setIntersection1:
4620             case setIntersection2:
4621                 fSetStack.pop();
4622                 leftOperand = (UnicodeSet *)fSetStack.peek();
4623                 leftOperand->retainAll(*rightOperand);
4624                 delete rightOperand;
4625                 break;
4626             case setUnion:
4627                 fSetStack.pop();
4628                 leftOperand = (UnicodeSet *)fSetStack.peek();
4629                 leftOperand->addAll(*rightOperand);
4630                 delete rightOperand;
4631                 break;
4632             default:
4633                 UPRV_UNREACHABLE;
4634             }
4635         }
4636     }
4637 
setPushOp(int32_t op)4638 void RegexCompile::setPushOp(int32_t op) {
4639     setEval(op);
4640     fSetOpStack.push(op, *fStatus);
4641     fSetStack.push(new UnicodeSet(), *fStatus);
4642 }
4643 
4644 U_NAMESPACE_END
4645 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4646 
4647