1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 *   Copyright (C) 2002-2016 International Business Machines Corporation
6 *   and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 //  file:  rematch.cpp
11 //
12 //         Contains the implementation of class RegexMatcher,
13 //         which is one of the main API classes for the ICU regular expression package.
14 //
15 
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36 
37 // #include <malloc.h>        // Needed for heapcheck testing
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 // Default limit for the size of the back track stack, to avoid system
43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 //    which use recursion rather than the heap, and take more storage per
46 //    backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49 
50 // Time limit counter constant.
51 //   Time limits for expression evaluation are in terms of quanta of work by
52 //   the engine, each of which is 10,000 state saves.
53 //   This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55 
56 
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60         return false;
61     }
62     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64 
65 //-----------------------------------------------------------------------------
66 //
67 //   Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71     fDeferredStatus = U_ZERO_ERROR;
72     init(fDeferredStatus);
73     if (U_FAILURE(fDeferredStatus)) {
74         return;
75     }
76     if (pat==NULL) {
77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78         return;
79     }
80     fPattern = pat;
81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83 
84 
85 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87                            uint32_t flags, UErrorCode &status) {
88     init(status);
89     if (U_FAILURE(status)) {
90         return;
91     }
92     UParseError    pe;
93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94     fPattern           = fPatternOwned;
95 
96     UText inputText = UTEXT_INITIALIZER;
97     utext_openConstUnicodeString(&inputText, &input, &status);
98     init2(&inputText, status);
99     utext_close(&inputText);
100 
101     fInputUniStrMaybeMutable = TRUE;
102 }
103 
104 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106                            uint32_t flags, UErrorCode &status) {
107     init(status);
108     if (U_FAILURE(status)) {
109         return;
110     }
111     UParseError    pe;
112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113     if (U_FAILURE(status)) {
114         return;
115     }
116 
117     fPattern           = fPatternOwned;
118     init2(input, status);
119 }
120 
121 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123                            uint32_t flags, UErrorCode &status) {
124     init(status);
125     if (U_FAILURE(status)) {
126         return;
127     }
128     UParseError    pe;
129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130     if (U_FAILURE(status)) {
131         return;
132     }
133     fPattern           = fPatternOwned;
134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138                            uint32_t flags, UErrorCode &status) {
139     init(status);
140     if (U_FAILURE(status)) {
141         return;
142     }
143     UParseError    pe;
144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145         if (U_FAILURE(status)) {
146         return;
147     }
148 
149     fPattern           = fPatternOwned;
150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152 
153 
154 
155 
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157     delete fStack;
158     if (fData != fSmallData) {
159         uprv_free(fData);
160         fData = NULL;
161     }
162     if (fPatternOwned) {
163         delete fPatternOwned;
164         fPatternOwned = NULL;
165         fPattern = NULL;
166     }
167 
168     if (fInput) {
169         delete fInput;
170     }
171     if (fInputText) {
172         utext_close(fInputText);
173     }
174     if (fAltInputText) {
175         utext_close(fAltInputText);
176     }
177 
178     #if UCONFIG_NO_BREAK_ITERATION==0
179     delete fWordBreakItr;
180     #endif
181 }
182 
183 //
184 //   init()   common initialization for use by all constructors.
185 //            Initialize all fields, get the object into a consistent state.
186 //            This must be done even when the initial status shows an error,
187 //            so that the object is initialized sufficiently well for the destructor
188 //            to run safely.
189 //
init(UErrorCode & status)190 void RegexMatcher::init(UErrorCode &status) {
191     fPattern           = NULL;
192     fPatternOwned      = NULL;
193     fFrameSize         = 0;
194     fRegionStart       = 0;
195     fRegionLimit       = 0;
196     fAnchorStart       = 0;
197     fAnchorLimit       = 0;
198     fLookStart         = 0;
199     fLookLimit         = 0;
200     fActiveStart       = 0;
201     fActiveLimit       = 0;
202     fTransparentBounds = FALSE;
203     fAnchoringBounds   = TRUE;
204     fMatch             = FALSE;
205     fMatchStart        = 0;
206     fMatchEnd          = 0;
207     fLastMatchEnd      = -1;
208     fAppendPosition    = 0;
209     fHitEnd            = FALSE;
210     fRequireEnd        = FALSE;
211     fStack             = NULL;
212     fFrame             = NULL;
213     fTimeLimit         = 0;
214     fTime              = 0;
215     fTickCounter       = 0;
216     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
217     fCallbackFn        = NULL;
218     fCallbackContext   = NULL;
219     fFindProgressCallbackFn      = NULL;
220     fFindProgressCallbackContext = NULL;
221     fTraceDebug        = FALSE;
222     fDeferredStatus    = status;
223     fData              = fSmallData;
224     fWordBreakItr      = NULL;
225 
226     fStack             = NULL;
227     fInputText         = NULL;
228     fAltInputText      = NULL;
229     fInput             = NULL;
230     fInputLength       = 0;
231     fInputUniStrMaybeMutable = FALSE;
232 }
233 
234 //
235 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
236 //            This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239     if (U_FAILURE(status)) {
240         fDeferredStatus = status;
241         return;
242     }
243 
244     if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246         if (fData == NULL) {
247             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248             return;
249         }
250     }
251 
252     fStack = new UVector64(status);
253     if (fStack == NULL) {
254         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255         return;
256     }
257 
258     reset(input);
259     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260     if (U_FAILURE(status)) {
261         fDeferredStatus = status;
262         return;
263     }
264 }
265 
266 
267 static const UChar BACKSLASH  = 0x5c;
268 static const UChar DOLLARSIGN = 0x24;
269 static const UChar LEFTBRACKET = 0x7b;
270 static const UChar RIGHTBRACKET = 0x7d;
271 
272 //--------------------------------------------------------------------------------
273 //
274 //    appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278                                               const UnicodeString &replacement,
279                                               UErrorCode &status) {
280     UText replacementText = UTEXT_INITIALIZER;
281 
282     utext_openConstUnicodeString(&replacementText, &replacement, &status);
283     if (U_SUCCESS(status)) {
284         UText resultText = UTEXT_INITIALIZER;
285         utext_openUnicodeString(&resultText, &dest, &status);
286 
287         if (U_SUCCESS(status)) {
288             appendReplacement(&resultText, &replacementText, status);
289             utext_close(&resultText);
290         }
291         utext_close(&replacementText);
292     }
293 
294     return *this;
295 }
296 
297 //
298 //    appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301                                               UText *replacement,
302                                               UErrorCode &status) {
303     if (U_FAILURE(status)) {
304         return *this;
305     }
306     if (U_FAILURE(fDeferredStatus)) {
307         status = fDeferredStatus;
308         return *this;
309     }
310     if (fMatch == FALSE) {
311         status = U_REGEX_INVALID_STATE;
312         return *this;
313     }
314 
315     // Copy input string from the end of previous match to start of current match
316     int64_t  destLen = utext_nativeLength(dest);
317     if (fMatchStart > fAppendPosition) {
318         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320                                      (int32_t)(fMatchStart-fAppendPosition), &status);
321         } else {
322             int32_t len16;
323             if (UTEXT_USES_U16(fInputText)) {
324                 len16 = (int32_t)(fMatchStart-fAppendPosition);
325             } else {
326                 UErrorCode lengthStatus = U_ZERO_ERROR;
327                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328             }
329             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330             if (inputChars == NULL) {
331                 status = U_MEMORY_ALLOCATION_ERROR;
332                 return *this;
333             }
334             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336             uprv_free(inputChars);
337         }
338     }
339     fAppendPosition = fMatchEnd;
340 
341 
342     // scan the replacement text, looking for substitutions ($n) and \escapes.
343     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
344     //         move entire ranges not containing substitutions.
345     UTEXT_SETNATIVEINDEX(replacement, 0);
346     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
347         if (c == BACKSLASH) {
348             // Backslash Escape.  Copy the following char out without further checks.
349             //                    Note:  Surrogate pairs don't need any special handling
350             //                           The second half wont be a '$' or a '\', and
351             //                           will move to the dest normally on the next
352             //                           loop iteration.
353             c = UTEXT_CURRENT32(replacement);
354             if (c == U_SENTINEL) {
355                 break;
356             }
357 
358             if (c==0x55/*U*/ || c==0x75/*u*/) {
359                 // We have a \udddd or \Udddddddd escape sequence.
360                 int32_t offset = 0;
361                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363                 if (escapedChar != (UChar32)0xFFFFFFFF) {
364                     if (U_IS_BMP(escapedChar)) {
365                         UChar c16 = (UChar)escapedChar;
366                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367                     } else {
368                         UChar surrogate[2];
369                         surrogate[0] = U16_LEAD(escapedChar);
370                         surrogate[1] = U16_TRAIL(escapedChar);
371                         if (U_SUCCESS(status)) {
372                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373                         }
374                     }
375                     // TODO:  Report errors for mal-formed \u escapes?
376                     //        As this is, the original sequence is output, which may be OK.
377                     if (context.lastOffset == offset) {
378                         (void)UTEXT_PREVIOUS32(replacement);
379                     } else if (context.lastOffset != offset-1) {
380                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381                     }
382                 }
383             } else {
384                 (void)UTEXT_NEXT32(replacement);
385                 // Plain backslash escape.  Just put out the escaped character.
386                 if (U_IS_BMP(c)) {
387                     UChar c16 = (UChar)c;
388                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389                 } else {
390                     UChar surrogate[2];
391                     surrogate[0] = U16_LEAD(c);
392                     surrogate[1] = U16_TRAIL(c);
393                     if (U_SUCCESS(status)) {
394                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395                     }
396                 }
397             }
398         } else if (c != DOLLARSIGN) {
399             // Normal char, not a $.  Copy it out without further checks.
400             if (U_IS_BMP(c)) {
401                 UChar c16 = (UChar)c;
402                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403             } else {
404                 UChar surrogate[2];
405                 surrogate[0] = U16_LEAD(c);
406                 surrogate[1] = U16_TRAIL(c);
407                 if (U_SUCCESS(status)) {
408                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409                 }
410             }
411         } else {
412             // We've got a $.  Pick up a capture group name or number if one follows.
413             // Consume digits so long as the resulting group number <= the number of
414             // number of capture groups in the pattern.
415 
416             int32_t groupNum  = 0;
417             int32_t numDigits = 0;
418             UChar32 nextChar = utext_current32(replacement);
419             if (nextChar == LEFTBRACKET) {
420                 // Scan for a Named Capture Group, ${name}.
421                 UnicodeString groupName;
422                 utext_next32(replacement);
423                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424                     nextChar = utext_next32(replacement);
425                     if (nextChar == U_SENTINEL) {
426                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
428                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
429                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
430                         groupName.append(nextChar);
431                     } else if (nextChar == RIGHTBRACKET) {
432                         groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433                         if (groupNum == 0) {
434                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435                         }
436                     } else {
437                         // Character was something other than a name char or a closing '}'
438                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439                     }
440                 }
441 
442             } else if (u_isdigit(nextChar)) {
443                 // $n    Scan for a capture group number
444                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445                 for (;;) {
446                     nextChar = UTEXT_CURRENT32(replacement);
447                     if (nextChar == U_SENTINEL) {
448                         break;
449                     }
450                     if (u_isdigit(nextChar) == FALSE) {
451                         break;
452                     }
453                     int32_t nextDigitVal = u_charDigitValue(nextChar);
454                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455                         // Don't consume the next digit if it makes the capture group number too big.
456                         if (numDigits == 0) {
457                             status = U_INDEX_OUTOFBOUNDS_ERROR;
458                         }
459                         break;
460                     }
461                     (void)UTEXT_NEXT32(replacement);
462                     groupNum=groupNum*10 + nextDigitVal;
463                     ++numDigits;
464                 }
465             } else {
466                 // $ not followed by capture group name or number.
467                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468             }
469 
470             if (U_SUCCESS(status)) {
471                 destLen += appendGroup(groupNum, dest, status);
472             }
473         }  // End of $ capture group handling
474     }  // End of per-character loop through the replacement string.
475 
476     return *this;
477 }
478 
479 
480 
481 //--------------------------------------------------------------------------------
482 //
483 //    appendTail     Intended to be used in conjunction with appendReplacement()
484 //                   To the destination string, append everything following
485 //                   the last match position from the input string.
486 //
487 //                   Note:  Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491     UErrorCode status = U_ZERO_ERROR;
492     UText resultText = UTEXT_INITIALIZER;
493     utext_openUnicodeString(&resultText, &dest, &status);
494 
495     if (U_SUCCESS(status)) {
496         appendTail(&resultText, status);
497         utext_close(&resultText);
498     }
499 
500     return dest;
501 }
502 
503 //
504 //   appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507     if (U_FAILURE(status)) {
508         return dest;
509     }
510     if (U_FAILURE(fDeferredStatus)) {
511         status = fDeferredStatus;
512         return dest;
513     }
514 
515     if (fInputLength > fAppendPosition) {
516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517             int64_t destLen = utext_nativeLength(dest);
518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519                           (int32_t)(fInputLength-fAppendPosition), &status);
520         } else {
521             int32_t len16;
522             if (UTEXT_USES_U16(fInputText)) {
523                 len16 = (int32_t)(fInputLength-fAppendPosition);
524             } else {
525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526                 status = U_ZERO_ERROR; // buffer overflow
527             }
528 
529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530             if (inputChars == NULL) {
531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532             } else {
533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534                 int64_t destLen = utext_nativeLength(dest);
535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536                 uprv_free(inputChars);
537             }
538         }
539     }
540     return dest;
541 }
542 
543 
544 
545 //--------------------------------------------------------------------------------
546 //
547 //   end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551     return end(0, err);
552 }
553 
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555     return end64(0, err);
556 }
557 
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559     if (U_FAILURE(err)) {
560         return -1;
561     }
562     if (fMatch == FALSE) {
563         err = U_REGEX_INVALID_STATE;
564         return -1;
565     }
566     if (group < 0 || group > fPattern->fGroupMap->size()) {
567         err = U_INDEX_OUTOFBOUNDS_ERROR;
568         return -1;
569     }
570     int64_t e = -1;
571     if (group == 0) {
572         e = fMatchEnd;
573     } else {
574         // Get the position within the stack frame of the variables for
575         //    this capture group.
576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577         U_ASSERT(groupOffset < fPattern->fFrameSize);
578         U_ASSERT(groupOffset >= 0);
579         e = fFrame->fExtra[groupOffset + 1];
580     }
581 
582         return e;
583 }
584 
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586     return (int32_t)end64(group, err);
587 }
588 
589 //--------------------------------------------------------------------------------
590 //
591 //   findProgressInterrupt  This function is called once for each advance in the target
592 //                          string from the find() function, and calls the user progress callback
593 //                          function if there is one installed.
594 //
595 //         Return:  TRUE if the find operation is to be terminated.
596 //                  FALSE if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601         status = U_REGEX_STOPPED_BY_CALLER;
602         return TRUE;
603     }
604     return FALSE;
605 }
606 
607 //--------------------------------------------------------------------------------
608 //
609 //   find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613     if (U_FAILURE(fDeferredStatus)) {
614         return FALSE;
615     }
616     UErrorCode status = U_ZERO_ERROR;
617     UBool result = find(status);
618     return result;
619 }
620 
621 //--------------------------------------------------------------------------------
622 //
623 //   find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627     // Start at the position of the last match end.  (Will be zero if the
628     //   matcher has been reset.)
629     //
630     if (U_FAILURE(status)) {
631         return FALSE;
632     }
633     if (U_FAILURE(fDeferredStatus)) {
634         status = fDeferredStatus;
635         return FALSE;
636     }
637 
638     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639         return findUsingChunk(status);
640     }
641 
642     int64_t startPos = fMatchEnd;
643     if (startPos==0) {
644         startPos = fActiveStart;
645     }
646 
647     if (fMatch) {
648         // Save the position of any previous successful match.
649         fLastMatchEnd = fMatchEnd;
650 
651         if (fMatchStart == fMatchEnd) {
652             // Previous match had zero length.  Move start position up one position
653             //  to avoid sending find() into a loop on zero-length matches.
654             if (startPos >= fActiveLimit) {
655                 fMatch = FALSE;
656                 fHitEnd = TRUE;
657                 return FALSE;
658             }
659             UTEXT_SETNATIVEINDEX(fInputText, startPos);
660             (void)UTEXT_NEXT32(fInputText);
661             startPos = UTEXT_GETNATIVEINDEX(fInputText);
662         }
663     } else {
664         if (fLastMatchEnd >= 0) {
665             // A previous find() failed to match.  Don't try again.
666             //   (without this test, a pattern with a zero-length match
667             //    could match again at the end of an input string.)
668             fHitEnd = TRUE;
669             return FALSE;
670         }
671     }
672 
673 
674     // Compute the position in the input string beyond which a match can not begin, because
675     //   the minimum length match would extend past the end of the input.
676     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
677     //          Be aware of possible overflows if making changes here.
678     int64_t testStartLimit;
679     if (UTEXT_USES_U16(fInputText)) {
680         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681         if (startPos > testStartLimit) {
682             fMatch = FALSE;
683             fHitEnd = TRUE;
684             return FALSE;
685         }
686     } else {
687         // We don't know exactly how long the minimum match length is in native characters.
688         // Treat anything > 0 as 1.
689         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690     }
691 
692     UChar32  c;
693     U_ASSERT(startPos >= 0);
694 
695     switch (fPattern->fStartType) {
696     case START_NO_INFO:
697         // No optimization was found.
698         //  Try a match at each input position.
699         for (;;) {
700             MatchAt(startPos, FALSE, status);
701             if (U_FAILURE(status)) {
702                 return FALSE;
703             }
704             if (fMatch) {
705                 return TRUE;
706             }
707             if (startPos >= testStartLimit) {
708                 fHitEnd = TRUE;
709                 return FALSE;
710             }
711             UTEXT_SETNATIVEINDEX(fInputText, startPos);
712             (void)UTEXT_NEXT32(fInputText);
713             startPos = UTEXT_GETNATIVEINDEX(fInputText);
714             // Note that it's perfectly OK for a pattern to have a zero-length
715             //   match at the end of a string, so we must make sure that the loop
716             //   runs with startPos == testStartLimit the last time through.
717             if  (findProgressInterrupt(startPos, status))
718                 return FALSE;
719         }
720         UPRV_UNREACHABLE;
721 
722     case START_START:
723         // Matches are only possible at the start of the input string
724         //   (pattern begins with ^ or \A)
725         if (startPos > fActiveStart) {
726             fMatch = FALSE;
727             return FALSE;
728         }
729         MatchAt(startPos, FALSE, status);
730         if (U_FAILURE(status)) {
731             return FALSE;
732         }
733         return fMatch;
734 
735 
736     case START_SET:
737         {
738             // Match may start on any char from a pre-computed set.
739             U_ASSERT(fPattern->fMinMatchLen > 0);
740             UTEXT_SETNATIVEINDEX(fInputText, startPos);
741             for (;;) {
742                 int64_t pos = startPos;
743                 c = UTEXT_NEXT32(fInputText);
744                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745                 // c will be -1 (U_SENTINEL) at end of text, in which case we
746                 // skip this next block (so we don't have a negative array index)
747                 // and handle end of text in the following block.
748                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
750                     MatchAt(pos, FALSE, status);
751                     if (U_FAILURE(status)) {
752                         return FALSE;
753                     }
754                     if (fMatch) {
755                         return TRUE;
756                     }
757                     UTEXT_SETNATIVEINDEX(fInputText, pos);
758                 }
759                 if (startPos > testStartLimit) {
760                     fMatch = FALSE;
761                     fHitEnd = TRUE;
762                     return FALSE;
763                 }
764                 if  (findProgressInterrupt(startPos, status))
765                     return FALSE;
766             }
767         }
768         UPRV_UNREACHABLE;
769 
770     case START_STRING:
771     case START_CHAR:
772         {
773             // Match starts on exactly one char.
774             U_ASSERT(fPattern->fMinMatchLen > 0);
775             UChar32 theChar = fPattern->fInitialChar;
776             UTEXT_SETNATIVEINDEX(fInputText, startPos);
777             for (;;) {
778                 int64_t pos = startPos;
779                 c = UTEXT_NEXT32(fInputText);
780                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781                 if (c == theChar) {
782                     MatchAt(pos, FALSE, status);
783                     if (U_FAILURE(status)) {
784                         return FALSE;
785                     }
786                     if (fMatch) {
787                         return TRUE;
788                     }
789                     UTEXT_SETNATIVEINDEX(fInputText, startPos);
790                 }
791                 if (startPos > testStartLimit) {
792                     fMatch = FALSE;
793                     fHitEnd = TRUE;
794                     return FALSE;
795                 }
796                 if  (findProgressInterrupt(startPos, status))
797                     return FALSE;
798            }
799         }
800         UPRV_UNREACHABLE;
801 
802     case START_LINE:
803         {
804             UChar32 ch;
805             if (startPos == fAnchorStart) {
806                 MatchAt(startPos, FALSE, status);
807                 if (U_FAILURE(status)) {
808                     return FALSE;
809                 }
810                 if (fMatch) {
811                     return TRUE;
812                 }
813                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814                 ch = UTEXT_NEXT32(fInputText);
815                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816             } else {
817                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818                 ch = UTEXT_PREVIOUS32(fInputText);
819                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820             }
821 
822             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823                 for (;;) {
824                     if (ch == 0x0a) {
825                             MatchAt(startPos, FALSE, status);
826                             if (U_FAILURE(status)) {
827                                 return FALSE;
828                             }
829                             if (fMatch) {
830                                 return TRUE;
831                             }
832                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
833                     }
834                     if (startPos >= testStartLimit) {
835                         fMatch = FALSE;
836                         fHitEnd = TRUE;
837                         return FALSE;
838                     }
839                     ch = UTEXT_NEXT32(fInputText);
840                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
841                     // Note that it's perfectly OK for a pattern to have a zero-length
842                     //   match at the end of a string, so we must make sure that the loop
843                     //   runs with startPos == testStartLimit the last time through.
844                     if  (findProgressInterrupt(startPos, status))
845                         return FALSE;
846                 }
847             } else {
848                 for (;;) {
849                     if (isLineTerminator(ch)) {
850                         if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851                             (void)UTEXT_NEXT32(fInputText);
852                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
853                         }
854                         MatchAt(startPos, FALSE, status);
855                         if (U_FAILURE(status)) {
856                             return FALSE;
857                         }
858                         if (fMatch) {
859                             return TRUE;
860                         }
861                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
862                     }
863                     if (startPos >= testStartLimit) {
864                         fMatch = FALSE;
865                         fHitEnd = TRUE;
866                         return FALSE;
867                     }
868                     ch = UTEXT_NEXT32(fInputText);
869                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
870                     // Note that it's perfectly OK for a pattern to have a zero-length
871                     //   match at the end of a string, so we must make sure that the loop
872                     //   runs with startPos == testStartLimit the last time through.
873                     if  (findProgressInterrupt(startPos, status))
874                         return FALSE;
875                 }
876             }
877         }
878 
879     default:
880         UPRV_UNREACHABLE;
881     }
882 
883     UPRV_UNREACHABLE;
884 }
885 
886 
887 
find(int64_t start,UErrorCode & status)888 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
889     if (U_FAILURE(status)) {
890         return FALSE;
891     }
892     if (U_FAILURE(fDeferredStatus)) {
893         status = fDeferredStatus;
894         return FALSE;
895     }
896     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
897                                           //        This will reset the region to be the full input length.
898     if (start < 0) {
899         status = U_INDEX_OUTOFBOUNDS_ERROR;
900         return FALSE;
901     }
902 
903     int64_t nativeStart = start;
904     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
905         status = U_INDEX_OUTOFBOUNDS_ERROR;
906         return FALSE;
907     }
908     fMatchEnd = nativeStart;
909     return find(status);
910 }
911 
912 
913 //--------------------------------------------------------------------------------
914 //
915 //   findUsingChunk() -- like find(), but with the advance knowledge that the
916 //                       entire string is available in the UText's chunk buffer.
917 //
918 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)919 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
920     // Start at the position of the last match end.  (Will be zero if the
921     //   matcher has been reset.
922     //
923 
924     int32_t startPos = (int32_t)fMatchEnd;
925     if (startPos==0) {
926         startPos = (int32_t)fActiveStart;
927     }
928 
929     const UChar *inputBuf = fInputText->chunkContents;
930 
931     if (fMatch) {
932         // Save the position of any previous successful match.
933         fLastMatchEnd = fMatchEnd;
934 
935         if (fMatchStart == fMatchEnd) {
936             // Previous match had zero length.  Move start position up one position
937             //  to avoid sending find() into a loop on zero-length matches.
938             if (startPos >= fActiveLimit) {
939                 fMatch = FALSE;
940                 fHitEnd = TRUE;
941                 return FALSE;
942             }
943             U16_FWD_1(inputBuf, startPos, fInputLength);
944         }
945     } else {
946         if (fLastMatchEnd >= 0) {
947             // A previous find() failed to match.  Don't try again.
948             //   (without this test, a pattern with a zero-length match
949             //    could match again at the end of an input string.)
950             fHitEnd = TRUE;
951             return FALSE;
952         }
953     }
954 
955 
956     // Compute the position in the input string beyond which a match can not begin, because
957     //   the minimum length match would extend past the end of the input.
958     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
959     //          Be aware of possible overflows if making changes here.
960     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
961     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
962     if (startPos > testLen) {
963         fMatch = FALSE;
964         fHitEnd = TRUE;
965         return FALSE;
966     }
967 
968     UChar32  c;
969     U_ASSERT(startPos >= 0);
970 
971     switch (fPattern->fStartType) {
972     case START_NO_INFO:
973         // No optimization was found.
974         //  Try a match at each input position.
975         for (;;) {
976             MatchChunkAt(startPos, FALSE, status);
977             if (U_FAILURE(status)) {
978                 return FALSE;
979             }
980             if (fMatch) {
981                 return TRUE;
982             }
983             if (startPos >= testLen) {
984                 fHitEnd = TRUE;
985                 return FALSE;
986             }
987             U16_FWD_1(inputBuf, startPos, fActiveLimit);
988             // Note that it's perfectly OK for a pattern to have a zero-length
989             //   match at the end of a string, so we must make sure that the loop
990             //   runs with startPos == testLen the last time through.
991             if  (findProgressInterrupt(startPos, status))
992                 return FALSE;
993         }
994         UPRV_UNREACHABLE;
995 
996     case START_START:
997         // Matches are only possible at the start of the input string
998         //   (pattern begins with ^ or \A)
999         if (startPos > fActiveStart) {
1000             fMatch = FALSE;
1001             return FALSE;
1002         }
1003         MatchChunkAt(startPos, FALSE, status);
1004         if (U_FAILURE(status)) {
1005             return FALSE;
1006         }
1007         return fMatch;
1008 
1009 
1010     case START_SET:
1011     {
1012         // Match may start on any char from a pre-computed set.
1013         U_ASSERT(fPattern->fMinMatchLen > 0);
1014         for (;;) {
1015             int32_t pos = startPos;
1016             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1017             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1018                 (c>=256 && fPattern->fInitialChars->contains(c))) {
1019                 MatchChunkAt(pos, FALSE, status);
1020                 if (U_FAILURE(status)) {
1021                     return FALSE;
1022                 }
1023                 if (fMatch) {
1024                     return TRUE;
1025                 }
1026             }
1027             if (startPos > testLen) {
1028                 fMatch = FALSE;
1029                 fHitEnd = TRUE;
1030                 return FALSE;
1031             }
1032             if  (findProgressInterrupt(startPos, status))
1033                 return FALSE;
1034         }
1035     }
1036     UPRV_UNREACHABLE;
1037 
1038     case START_STRING:
1039     case START_CHAR:
1040     {
1041         // Match starts on exactly one char.
1042         U_ASSERT(fPattern->fMinMatchLen > 0);
1043         UChar32 theChar = fPattern->fInitialChar;
1044         for (;;) {
1045             int32_t pos = startPos;
1046             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1047             if (c == theChar) {
1048                 MatchChunkAt(pos, FALSE, status);
1049                 if (U_FAILURE(status)) {
1050                     return FALSE;
1051                 }
1052                 if (fMatch) {
1053                     return TRUE;
1054                 }
1055             }
1056             if (startPos > testLen) {
1057                 fMatch = FALSE;
1058                 fHitEnd = TRUE;
1059                 return FALSE;
1060             }
1061             if  (findProgressInterrupt(startPos, status))
1062                 return FALSE;
1063         }
1064     }
1065     UPRV_UNREACHABLE;
1066 
1067     case START_LINE:
1068     {
1069         UChar32 ch;
1070         if (startPos == fAnchorStart) {
1071             MatchChunkAt(startPos, FALSE, status);
1072             if (U_FAILURE(status)) {
1073                 return FALSE;
1074             }
1075             if (fMatch) {
1076                 return TRUE;
1077             }
1078             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1079         }
1080 
1081         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1082             for (;;) {
1083                 ch = inputBuf[startPos-1];
1084                 if (ch == 0x0a) {
1085                     MatchChunkAt(startPos, FALSE, status);
1086                     if (U_FAILURE(status)) {
1087                         return FALSE;
1088                     }
1089                     if (fMatch) {
1090                         return TRUE;
1091                     }
1092                 }
1093                 if (startPos >= testLen) {
1094                     fMatch = FALSE;
1095                     fHitEnd = TRUE;
1096                     return FALSE;
1097                 }
1098                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1099                 // Note that it's perfectly OK for a pattern to have a zero-length
1100                 //   match at the end of a string, so we must make sure that the loop
1101                 //   runs with startPos == testLen the last time through.
1102                 if  (findProgressInterrupt(startPos, status))
1103                     return FALSE;
1104             }
1105         } else {
1106             for (;;) {
1107                 ch = inputBuf[startPos-1];
1108                 if (isLineTerminator(ch)) {
1109                     if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1110                         startPos++;
1111                     }
1112                     MatchChunkAt(startPos, FALSE, status);
1113                     if (U_FAILURE(status)) {
1114                         return FALSE;
1115                     }
1116                     if (fMatch) {
1117                         return TRUE;
1118                     }
1119                 }
1120                 if (startPos >= testLen) {
1121                     fMatch = FALSE;
1122                     fHitEnd = TRUE;
1123                     return FALSE;
1124                 }
1125                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1126                 // Note that it's perfectly OK for a pattern to have a zero-length
1127                 //   match at the end of a string, so we must make sure that the loop
1128                 //   runs with startPos == testLen the last time through.
1129                 if  (findProgressInterrupt(startPos, status))
1130                     return FALSE;
1131             }
1132         }
1133     }
1134 
1135     default:
1136         UPRV_UNREACHABLE;
1137     }
1138 
1139     UPRV_UNREACHABLE;
1140 }
1141 
1142 
1143 
1144 //--------------------------------------------------------------------------------
1145 //
1146 //  group()
1147 //
1148 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1149 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1150     return group(0, status);
1151 }
1152 
1153 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1154 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1155     return group(0, dest, group_len, status);
1156 }
1157 
1158 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1159 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1160     group_len = 0;
1161     if (U_FAILURE(status)) {
1162         return dest;
1163     }
1164     if (U_FAILURE(fDeferredStatus)) {
1165         status = fDeferredStatus;
1166     } else if (fMatch == FALSE) {
1167         status = U_REGEX_INVALID_STATE;
1168     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1169         status = U_INDEX_OUTOFBOUNDS_ERROR;
1170     }
1171 
1172     if (U_FAILURE(status)) {
1173         return dest;
1174     }
1175 
1176     int64_t s, e;
1177     if (groupNum == 0) {
1178         s = fMatchStart;
1179         e = fMatchEnd;
1180     } else {
1181         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1182         U_ASSERT(groupOffset < fPattern->fFrameSize);
1183         U_ASSERT(groupOffset >= 0);
1184         s = fFrame->fExtra[groupOffset];
1185         e = fFrame->fExtra[groupOffset+1];
1186     }
1187 
1188     if (s < 0) {
1189         // A capture group wasn't part of the match
1190         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1191     }
1192     U_ASSERT(s <= e);
1193     group_len = e - s;
1194 
1195     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1196     if (dest)
1197         UTEXT_SETNATIVEINDEX(dest, s);
1198     return dest;
1199 }
1200 
group(int32_t groupNum,UErrorCode & status) const1201 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1202     UnicodeString result;
1203     int64_t groupStart = start64(groupNum, status);
1204     int64_t groupEnd = end64(groupNum, status);
1205     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1206         return result;
1207     }
1208 
1209     // Get the group length using a utext_extract preflight.
1210     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1211     int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1212     if (status != U_BUFFER_OVERFLOW_ERROR) {
1213         return result;
1214     }
1215 
1216     status = U_ZERO_ERROR;
1217     UChar *buf = result.getBuffer(length);
1218     if (buf == NULL) {
1219         status = U_MEMORY_ALLOCATION_ERROR;
1220     } else {
1221         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1222         result.releaseBuffer(extractLength);
1223         U_ASSERT(length == extractLength);
1224     }
1225     return result;
1226 }
1227 
1228 
1229 //--------------------------------------------------------------------------------
1230 //
1231 //  appendGroup() -- currently internal only, appends a group to a UText rather
1232 //                   than replacing its contents
1233 //
1234 //--------------------------------------------------------------------------------
1235 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1236 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1237     if (U_FAILURE(status)) {
1238         return 0;
1239     }
1240     if (U_FAILURE(fDeferredStatus)) {
1241         status = fDeferredStatus;
1242         return 0;
1243     }
1244     int64_t destLen = utext_nativeLength(dest);
1245 
1246     if (fMatch == FALSE) {
1247         status = U_REGEX_INVALID_STATE;
1248         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1249     }
1250     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1251         status = U_INDEX_OUTOFBOUNDS_ERROR;
1252         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1253     }
1254 
1255     int64_t s, e;
1256     if (groupNum == 0) {
1257         s = fMatchStart;
1258         e = fMatchEnd;
1259     } else {
1260         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1261         U_ASSERT(groupOffset < fPattern->fFrameSize);
1262         U_ASSERT(groupOffset >= 0);
1263         s = fFrame->fExtra[groupOffset];
1264         e = fFrame->fExtra[groupOffset+1];
1265     }
1266 
1267     if (s < 0) {
1268         // A capture group wasn't part of the match
1269         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1270     }
1271     U_ASSERT(s <= e);
1272 
1273     int64_t deltaLen;
1274     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1275         U_ASSERT(e <= fInputLength);
1276         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1277     } else {
1278         int32_t len16;
1279         if (UTEXT_USES_U16(fInputText)) {
1280             len16 = (int32_t)(e-s);
1281         } else {
1282             UErrorCode lengthStatus = U_ZERO_ERROR;
1283             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1284         }
1285         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1286         if (groupChars == NULL) {
1287             status = U_MEMORY_ALLOCATION_ERROR;
1288             return 0;
1289         }
1290         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1291 
1292         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1293         uprv_free(groupChars);
1294     }
1295     return deltaLen;
1296 }
1297 
1298 
1299 
1300 //--------------------------------------------------------------------------------
1301 //
1302 //  groupCount()
1303 //
1304 //--------------------------------------------------------------------------------
groupCount() const1305 int32_t RegexMatcher::groupCount() const {
1306     return fPattern->fGroupMap->size();
1307 }
1308 
1309 //--------------------------------------------------------------------------------
1310 //
1311 //  hasAnchoringBounds()
1312 //
1313 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1314 UBool RegexMatcher::hasAnchoringBounds() const {
1315     return fAnchoringBounds;
1316 }
1317 
1318 
1319 //--------------------------------------------------------------------------------
1320 //
1321 //  hasTransparentBounds()
1322 //
1323 //--------------------------------------------------------------------------------
hasTransparentBounds() const1324 UBool RegexMatcher::hasTransparentBounds() const {
1325     return fTransparentBounds;
1326 }
1327 
1328 
1329 
1330 //--------------------------------------------------------------------------------
1331 //
1332 //  hitEnd()
1333 //
1334 //--------------------------------------------------------------------------------
hitEnd() const1335 UBool RegexMatcher::hitEnd() const {
1336     return fHitEnd;
1337 }
1338 
1339 
1340 //--------------------------------------------------------------------------------
1341 //
1342 //  input()
1343 //
1344 //--------------------------------------------------------------------------------
input() const1345 const UnicodeString &RegexMatcher::input() const {
1346     if (!fInput) {
1347         UErrorCode status = U_ZERO_ERROR;
1348         int32_t len16;
1349         if (UTEXT_USES_U16(fInputText)) {
1350             len16 = (int32_t)fInputLength;
1351         } else {
1352             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1353             status = U_ZERO_ERROR; // overflow, length status
1354         }
1355         UnicodeString *result = new UnicodeString(len16, 0, 0);
1356 
1357         UChar *inputChars = result->getBuffer(len16);
1358         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1359         result->releaseBuffer(len16);
1360 
1361         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1362     }
1363 
1364     return *fInput;
1365 }
1366 
1367 //--------------------------------------------------------------------------------
1368 //
1369 //  inputText()
1370 //
1371 //--------------------------------------------------------------------------------
inputText() const1372 UText *RegexMatcher::inputText() const {
1373     return fInputText;
1374 }
1375 
1376 
1377 //--------------------------------------------------------------------------------
1378 //
1379 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1380 //
1381 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1382 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1383     if (U_FAILURE(status)) {
1384         return dest;
1385     }
1386     if (U_FAILURE(fDeferredStatus)) {
1387         status = fDeferredStatus;
1388         return dest;
1389     }
1390 
1391     if (dest) {
1392         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1393             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1394         } else {
1395             int32_t input16Len;
1396             if (UTEXT_USES_U16(fInputText)) {
1397                 input16Len = (int32_t)fInputLength;
1398             } else {
1399                 UErrorCode lengthStatus = U_ZERO_ERROR;
1400                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1401             }
1402             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1403             if (inputChars == NULL) {
1404                 return dest;
1405             }
1406 
1407             status = U_ZERO_ERROR;
1408             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1409             status = U_ZERO_ERROR;
1410             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1411 
1412             uprv_free(inputChars);
1413         }
1414         return dest;
1415     } else {
1416         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1417     }
1418 }
1419 
1420 
1421 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1422 static UBool compat_SyncMutableUTextContents(UText *ut) {
1423     UBool retVal = FALSE;
1424 
1425     //  In the following test, we're really only interested in whether the UText should switch
1426     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1427     //  will still point to the correct data.
1428     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1429         UnicodeString *us=(UnicodeString *)ut->context;
1430 
1431         // Update to the latest length.
1432         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1433         int32_t newLength = us->length();
1434 
1435         // Update the chunk description.
1436         // The buffer may have switched between stack- and heap-based.
1437         ut->chunkContents    = us->getBuffer();
1438         ut->chunkLength      = newLength;
1439         ut->chunkNativeLimit = newLength;
1440         ut->nativeIndexingLimit = newLength;
1441         retVal = TRUE;
1442     }
1443 
1444     return retVal;
1445 }
1446 
1447 //--------------------------------------------------------------------------------
1448 //
1449 //  lookingAt()
1450 //
1451 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1452 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1453     if (U_FAILURE(status)) {
1454         return FALSE;
1455     }
1456     if (U_FAILURE(fDeferredStatus)) {
1457         status = fDeferredStatus;
1458         return FALSE;
1459     }
1460 
1461     if (fInputUniStrMaybeMutable) {
1462         if (compat_SyncMutableUTextContents(fInputText)) {
1463         fInputLength = utext_nativeLength(fInputText);
1464         reset();
1465         }
1466     }
1467     else {
1468         resetPreserveRegion();
1469     }
1470     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1471         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1472     } else {
1473         MatchAt(fActiveStart, FALSE, status);
1474     }
1475     return fMatch;
1476 }
1477 
1478 
lookingAt(int64_t start,UErrorCode & status)1479 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1480     if (U_FAILURE(status)) {
1481         return FALSE;
1482     }
1483     if (U_FAILURE(fDeferredStatus)) {
1484         status = fDeferredStatus;
1485         return FALSE;
1486     }
1487     reset();
1488 
1489     if (start < 0) {
1490         status = U_INDEX_OUTOFBOUNDS_ERROR;
1491         return FALSE;
1492     }
1493 
1494     if (fInputUniStrMaybeMutable) {
1495         if (compat_SyncMutableUTextContents(fInputText)) {
1496         fInputLength = utext_nativeLength(fInputText);
1497         reset();
1498         }
1499     }
1500 
1501     int64_t nativeStart;
1502     nativeStart = start;
1503     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1504         status = U_INDEX_OUTOFBOUNDS_ERROR;
1505         return FALSE;
1506     }
1507 
1508     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1509         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1510     } else {
1511         MatchAt(nativeStart, FALSE, status);
1512     }
1513     return fMatch;
1514 }
1515 
1516 
1517 
1518 //--------------------------------------------------------------------------------
1519 //
1520 //  matches()
1521 //
1522 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1523 UBool RegexMatcher::matches(UErrorCode &status) {
1524     if (U_FAILURE(status)) {
1525         return FALSE;
1526     }
1527     if (U_FAILURE(fDeferredStatus)) {
1528         status = fDeferredStatus;
1529         return FALSE;
1530     }
1531 
1532     if (fInputUniStrMaybeMutable) {
1533         if (compat_SyncMutableUTextContents(fInputText)) {
1534         fInputLength = utext_nativeLength(fInputText);
1535         reset();
1536         }
1537     }
1538     else {
1539         resetPreserveRegion();
1540     }
1541 
1542     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1543         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1544     } else {
1545         MatchAt(fActiveStart, TRUE, status);
1546     }
1547     return fMatch;
1548 }
1549 
1550 
matches(int64_t start,UErrorCode & status)1551 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1552     if (U_FAILURE(status)) {
1553         return FALSE;
1554     }
1555     if (U_FAILURE(fDeferredStatus)) {
1556         status = fDeferredStatus;
1557         return FALSE;
1558     }
1559     reset();
1560 
1561     if (start < 0) {
1562         status = U_INDEX_OUTOFBOUNDS_ERROR;
1563         return FALSE;
1564     }
1565 
1566     if (fInputUniStrMaybeMutable) {
1567         if (compat_SyncMutableUTextContents(fInputText)) {
1568         fInputLength = utext_nativeLength(fInputText);
1569         reset();
1570         }
1571     }
1572 
1573     int64_t nativeStart;
1574     nativeStart = start;
1575     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1576         status = U_INDEX_OUTOFBOUNDS_ERROR;
1577         return FALSE;
1578     }
1579 
1580     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1581         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1582     } else {
1583         MatchAt(nativeStart, TRUE, status);
1584     }
1585     return fMatch;
1586 }
1587 
1588 
1589 
1590 //--------------------------------------------------------------------------------
1591 //
1592 //    pattern
1593 //
1594 //--------------------------------------------------------------------------------
pattern() const1595 const RegexPattern &RegexMatcher::pattern() const {
1596     return *fPattern;
1597 }
1598 
1599 
1600 
1601 //--------------------------------------------------------------------------------
1602 //
1603 //    region
1604 //
1605 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1606 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1607     if (U_FAILURE(status)) {
1608         return *this;
1609     }
1610 
1611     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1612         status = U_ILLEGAL_ARGUMENT_ERROR;
1613     }
1614 
1615     int64_t nativeStart = regionStart;
1616     int64_t nativeLimit = regionLimit;
1617     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1618       status = U_ILLEGAL_ARGUMENT_ERROR;
1619     }
1620 
1621     if (startIndex == -1)
1622       this->reset();
1623     else
1624       resetPreserveRegion();
1625 
1626     fRegionStart = nativeStart;
1627     fRegionLimit = nativeLimit;
1628     fActiveStart = nativeStart;
1629     fActiveLimit = nativeLimit;
1630 
1631     if (startIndex != -1) {
1632       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1633           status = U_INDEX_OUTOFBOUNDS_ERROR;
1634       }
1635       fMatchEnd = startIndex;
1636     }
1637 
1638     if (!fTransparentBounds) {
1639         fLookStart = nativeStart;
1640         fLookLimit = nativeLimit;
1641     }
1642     if (fAnchoringBounds) {
1643         fAnchorStart = nativeStart;
1644         fAnchorLimit = nativeLimit;
1645     }
1646     return *this;
1647 }
1648 
region(int64_t start,int64_t limit,UErrorCode & status)1649 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1650   return region(start, limit, -1, status);
1651 }
1652 
1653 //--------------------------------------------------------------------------------
1654 //
1655 //    regionEnd
1656 //
1657 //--------------------------------------------------------------------------------
regionEnd() const1658 int32_t RegexMatcher::regionEnd() const {
1659     return (int32_t)fRegionLimit;
1660 }
1661 
regionEnd64() const1662 int64_t RegexMatcher::regionEnd64() const {
1663     return fRegionLimit;
1664 }
1665 
1666 //--------------------------------------------------------------------------------
1667 //
1668 //    regionStart
1669 //
1670 //--------------------------------------------------------------------------------
regionStart() const1671 int32_t RegexMatcher::regionStart() const {
1672     return (int32_t)fRegionStart;
1673 }
1674 
regionStart64() const1675 int64_t RegexMatcher::regionStart64() const {
1676     return fRegionStart;
1677 }
1678 
1679 
1680 //--------------------------------------------------------------------------------
1681 //
1682 //    replaceAll
1683 //
1684 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1685 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1686     UText replacementText = UTEXT_INITIALIZER;
1687     UText resultText = UTEXT_INITIALIZER;
1688     UnicodeString resultString;
1689     if (U_FAILURE(status)) {
1690         return resultString;
1691     }
1692 
1693     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1694     utext_openUnicodeString(&resultText, &resultString, &status);
1695 
1696     replaceAll(&replacementText, &resultText, status);
1697 
1698     utext_close(&resultText);
1699     utext_close(&replacementText);
1700 
1701     return resultString;
1702 }
1703 
1704 
1705 //
1706 //    replaceAll, UText mode
1707 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1708 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1709     if (U_FAILURE(status)) {
1710         return dest;
1711     }
1712     if (U_FAILURE(fDeferredStatus)) {
1713         status = fDeferredStatus;
1714         return dest;
1715     }
1716 
1717     if (dest == NULL) {
1718         UnicodeString emptyString;
1719         UText empty = UTEXT_INITIALIZER;
1720 
1721         utext_openUnicodeString(&empty, &emptyString, &status);
1722         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1723         utext_close(&empty);
1724     }
1725 
1726     if (U_SUCCESS(status)) {
1727         reset();
1728         while (find()) {
1729             appendReplacement(dest, replacement, status);
1730             if (U_FAILURE(status)) {
1731                 break;
1732             }
1733         }
1734         appendTail(dest, status);
1735     }
1736 
1737     return dest;
1738 }
1739 
1740 
1741 //--------------------------------------------------------------------------------
1742 //
1743 //    replaceFirst
1744 //
1745 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1746 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1747     UText replacementText = UTEXT_INITIALIZER;
1748     UText resultText = UTEXT_INITIALIZER;
1749     UnicodeString resultString;
1750 
1751     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1752     utext_openUnicodeString(&resultText, &resultString, &status);
1753 
1754     replaceFirst(&replacementText, &resultText, status);
1755 
1756     utext_close(&resultText);
1757     utext_close(&replacementText);
1758 
1759     return resultString;
1760 }
1761 
1762 //
1763 //    replaceFirst, UText mode
1764 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1765 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1766     if (U_FAILURE(status)) {
1767         return dest;
1768     }
1769     if (U_FAILURE(fDeferredStatus)) {
1770         status = fDeferredStatus;
1771         return dest;
1772     }
1773 
1774     reset();
1775     if (!find()) {
1776         return getInput(dest, status);
1777     }
1778 
1779     if (dest == NULL) {
1780         UnicodeString emptyString;
1781         UText empty = UTEXT_INITIALIZER;
1782 
1783         utext_openUnicodeString(&empty, &emptyString, &status);
1784         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1785         utext_close(&empty);
1786     }
1787 
1788     appendReplacement(dest, replacement, status);
1789     appendTail(dest, status);
1790 
1791     return dest;
1792 }
1793 
1794 
1795 //--------------------------------------------------------------------------------
1796 //
1797 //     requireEnd
1798 //
1799 //--------------------------------------------------------------------------------
requireEnd() const1800 UBool RegexMatcher::requireEnd() const {
1801     return fRequireEnd;
1802 }
1803 
1804 
1805 //--------------------------------------------------------------------------------
1806 //
1807 //     reset
1808 //
1809 //--------------------------------------------------------------------------------
reset()1810 RegexMatcher &RegexMatcher::reset() {
1811     fRegionStart    = 0;
1812     fRegionLimit    = fInputLength;
1813     fActiveStart    = 0;
1814     fActiveLimit    = fInputLength;
1815     fAnchorStart    = 0;
1816     fAnchorLimit    = fInputLength;
1817     fLookStart      = 0;
1818     fLookLimit      = fInputLength;
1819     resetPreserveRegion();
1820     return *this;
1821 }
1822 
1823 
1824 
resetPreserveRegion()1825 void RegexMatcher::resetPreserveRegion() {
1826     fMatchStart     = 0;
1827     fMatchEnd       = 0;
1828     fLastMatchEnd   = -1;
1829     fAppendPosition = 0;
1830     fMatch          = FALSE;
1831     fHitEnd         = FALSE;
1832     fRequireEnd     = FALSE;
1833     fTime           = 0;
1834     fTickCounter    = TIMER_INITIAL_VALUE;
1835     //resetStack(); // more expensive than it looks...
1836 }
1837 
1838 
reset(const UnicodeString & input)1839 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1840     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1841     if (fPattern->fNeedsAltInput) {
1842         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1843     }
1844     if (U_FAILURE(fDeferredStatus)) {
1845         return *this;
1846     }
1847     fInputLength = utext_nativeLength(fInputText);
1848 
1849     reset();
1850     delete fInput;
1851     fInput = NULL;
1852 
1853     //  Do the following for any UnicodeString.
1854     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1855     fInputUniStrMaybeMutable = TRUE;
1856 
1857     if (fWordBreakItr != NULL) {
1858 #if UCONFIG_NO_BREAK_ITERATION==0
1859         UErrorCode status = U_ZERO_ERROR;
1860         fWordBreakItr->setText(fInputText, status);
1861 #endif
1862     }
1863     return *this;
1864 }
1865 
1866 
reset(UText * input)1867 RegexMatcher &RegexMatcher::reset(UText *input) {
1868     if (fInputText != input) {
1869         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1870         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1871         if (U_FAILURE(fDeferredStatus)) {
1872             return *this;
1873         }
1874         fInputLength = utext_nativeLength(fInputText);
1875 
1876         delete fInput;
1877         fInput = NULL;
1878 
1879         if (fWordBreakItr != NULL) {
1880 #if UCONFIG_NO_BREAK_ITERATION==0
1881             UErrorCode status = U_ZERO_ERROR;
1882             fWordBreakItr->setText(input, status);
1883 #endif
1884         }
1885     }
1886     reset();
1887     fInputUniStrMaybeMutable = FALSE;
1888 
1889     return *this;
1890 }
1891 
1892 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1893     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1894     return *this;
1895 }*/
1896 
reset(int64_t position,UErrorCode & status)1897 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1898     if (U_FAILURE(status)) {
1899         return *this;
1900     }
1901     reset();       // Reset also resets the region to be the entire string.
1902 
1903     if (position < 0 || position > fActiveLimit) {
1904         status = U_INDEX_OUTOFBOUNDS_ERROR;
1905         return *this;
1906     }
1907     fMatchEnd = position;
1908     return *this;
1909 }
1910 
1911 
1912 //--------------------------------------------------------------------------------
1913 //
1914 //    refresh
1915 //
1916 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1917 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1918     if (U_FAILURE(status)) {
1919         return *this;
1920     }
1921     if (input == NULL) {
1922         status = U_ILLEGAL_ARGUMENT_ERROR;
1923         return *this;
1924     }
1925     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1926         status = U_ILLEGAL_ARGUMENT_ERROR;
1927         return *this;
1928     }
1929     int64_t  pos = utext_getNativeIndex(fInputText);
1930     //  Shallow read-only clone of the new UText into the existing input UText
1931     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1932     if (U_FAILURE(status)) {
1933         return *this;
1934     }
1935     utext_setNativeIndex(fInputText, pos);
1936 
1937     if (fAltInputText != NULL) {
1938         pos = utext_getNativeIndex(fAltInputText);
1939         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1940         if (U_FAILURE(status)) {
1941             return *this;
1942         }
1943         utext_setNativeIndex(fAltInputText, pos);
1944     }
1945     return *this;
1946 }
1947 
1948 
1949 
1950 //--------------------------------------------------------------------------------
1951 //
1952 //    setTrace
1953 //
1954 //--------------------------------------------------------------------------------
setTrace(UBool state)1955 void RegexMatcher::setTrace(UBool state) {
1956     fTraceDebug = state;
1957 }
1958 
1959 
1960 
1961 /**
1962   *  UText, replace entire contents of the destination UText with a substring of the source UText.
1963   *
1964   *     @param src    The source UText
1965   *     @param dest   The destination UText. Must be writable.
1966   *                   May be NULL, in which case a new UText will be allocated.
1967   *     @param start  Start index of source substring.
1968   *     @param limit  Limit index of source substring.
1969   *     @param status An error code.
1970   */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1971 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1972     if (U_FAILURE(*status)) {
1973         return dest;
1974     }
1975     if (start == limit) {
1976         if (dest) {
1977             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1978             return dest;
1979         } else {
1980             return utext_openUChars(NULL, NULL, 0, status);
1981         }
1982     }
1983     int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1984     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1985         return dest;
1986     }
1987     *status = U_ZERO_ERROR;
1988     MaybeStackArray<UChar, 40> buffer;
1989     if (length >= buffer.getCapacity()) {
1990         UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
1991         if (newBuf == NULL) {
1992             *status = U_MEMORY_ALLOCATION_ERROR;
1993         }
1994     }
1995     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1996     if (dest) {
1997         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
1998         return dest;
1999     }
2000 
2001     // Caller did not provide a prexisting UText.
2002     // Open a new one, and have it adopt the text buffer storage.
2003     if (U_FAILURE(*status)) {
2004         return NULL;
2005     }
2006     int32_t ownedLength = 0;
2007     UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2008     if (ownedBuf == NULL) {
2009         *status = U_MEMORY_ALLOCATION_ERROR;
2010         return NULL;
2011     }
2012     UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2013     if (U_FAILURE(*status)) {
2014         uprv_free(ownedBuf);
2015         return NULL;
2016     }
2017     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2018     return result;
2019 }
2020 
2021 
2022 //---------------------------------------------------------------------
2023 //
2024 //   split
2025 //
2026 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2027 int32_t  RegexMatcher::split(const UnicodeString &input,
2028         UnicodeString    dest[],
2029         int32_t          destCapacity,
2030         UErrorCode      &status)
2031 {
2032     UText inputText = UTEXT_INITIALIZER;
2033     utext_openConstUnicodeString(&inputText, &input, &status);
2034     if (U_FAILURE(status)) {
2035         return 0;
2036     }
2037 
2038     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2039     if (destText == NULL) {
2040         status = U_MEMORY_ALLOCATION_ERROR;
2041         return 0;
2042     }
2043     int32_t i;
2044     for (i = 0; i < destCapacity; i++) {
2045         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2046     }
2047 
2048     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2049 
2050     for (i = 0; i < destCapacity; i++) {
2051         utext_close(destText[i]);
2052     }
2053 
2054     uprv_free(destText);
2055     utext_close(&inputText);
2056     return fieldCount;
2057 }
2058 
2059 //
2060 //   split, UText mode
2061 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2062 int32_t  RegexMatcher::split(UText *input,
2063         UText           *dest[],
2064         int32_t          destCapacity,
2065         UErrorCode      &status)
2066 {
2067     //
2068     // Check arguements for validity
2069     //
2070     if (U_FAILURE(status)) {
2071         return 0;
2072     }
2073 
2074     if (destCapacity < 1) {
2075         status = U_ILLEGAL_ARGUMENT_ERROR;
2076         return 0;
2077     }
2078 
2079     //
2080     // Reset for the input text
2081     //
2082     reset(input);
2083     int64_t   nextOutputStringStart = 0;
2084     if (fActiveLimit == 0) {
2085         return 0;
2086     }
2087 
2088     //
2089     // Loop through the input text, searching for the delimiter pattern
2090     //
2091     int32_t i;
2092     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2093     for (i=0; ; i++) {
2094         if (i>=destCapacity-1) {
2095             // There is one or zero output string left.
2096             // Fill the last output string with whatever is left from the input, then exit the loop.
2097             //  ( i will be == destCapacity if we filled the output array while processing
2098             //    capture groups of the delimiter expression, in which case we will discard the
2099             //    last capture group saved in favor of the unprocessed remainder of the
2100             //    input string.)
2101             i = destCapacity-1;
2102             if (fActiveLimit > nextOutputStringStart) {
2103                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2104                     if (dest[i]) {
2105                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2106                                       input->chunkContents+nextOutputStringStart,
2107                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2108                     } else {
2109                         UText remainingText = UTEXT_INITIALIZER;
2110                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2111                                          fActiveLimit-nextOutputStringStart, &status);
2112                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2113                         utext_close(&remainingText);
2114                     }
2115                 } else {
2116                     UErrorCode lengthStatus = U_ZERO_ERROR;
2117                     int32_t remaining16Length =
2118                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2119                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2120                     if (remainingChars == NULL) {
2121                         status = U_MEMORY_ALLOCATION_ERROR;
2122                         break;
2123                     }
2124 
2125                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2126                     if (dest[i]) {
2127                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2128                     } else {
2129                         UText remainingText = UTEXT_INITIALIZER;
2130                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2131                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2132                         utext_close(&remainingText);
2133                     }
2134 
2135                     uprv_free(remainingChars);
2136                 }
2137             }
2138             break;
2139         }
2140         if (find()) {
2141             // We found another delimiter.  Move everything from where we started looking
2142             //  up until the start of the delimiter into the next output string.
2143             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2144                 if (dest[i]) {
2145                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2146                                   input->chunkContents+nextOutputStringStart,
2147                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2148                 } else {
2149                     UText remainingText = UTEXT_INITIALIZER;
2150                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2151                                       fMatchStart-nextOutputStringStart, &status);
2152                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2153                     utext_close(&remainingText);
2154                 }
2155             } else {
2156                 UErrorCode lengthStatus = U_ZERO_ERROR;
2157                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2158                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2159                 if (remainingChars == NULL) {
2160                     status = U_MEMORY_ALLOCATION_ERROR;
2161                     break;
2162                 }
2163                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2164                 if (dest[i]) {
2165                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2166                 } else {
2167                     UText remainingText = UTEXT_INITIALIZER;
2168                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2169                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2170                     utext_close(&remainingText);
2171                 }
2172 
2173                 uprv_free(remainingChars);
2174             }
2175             nextOutputStringStart = fMatchEnd;
2176 
2177             // If the delimiter pattern has capturing parentheses, the captured
2178             //  text goes out into the next n destination strings.
2179             int32_t groupNum;
2180             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2181                 if (i >= destCapacity-2) {
2182                     // Never fill the last available output string with capture group text.
2183                     // It will filled with the last field, the remainder of the
2184                     //  unsplit input text.
2185                     break;
2186                 }
2187                 i++;
2188                 dest[i] = utext_extract_replace(fInputText, dest[i],
2189                                                start64(groupNum, status), end64(groupNum, status), &status);
2190             }
2191 
2192             if (nextOutputStringStart == fActiveLimit) {
2193                 // The delimiter was at the end of the string.  We're done, but first
2194                 // we output one last empty string, for the empty field following
2195                 //   the delimiter at the end of input.
2196                 if (i+1 < destCapacity) {
2197                     ++i;
2198                     if (dest[i] == NULL) {
2199                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2200                     } else {
2201                         static const UChar emptyString[] = {(UChar)0};
2202                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2203                     }
2204                 }
2205                 break;
2206 
2207             }
2208         }
2209         else
2210         {
2211             // We ran off the end of the input while looking for the next delimiter.
2212             // All the remaining text goes into the current output string.
2213             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2214                 if (dest[i]) {
2215                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2216                                   input->chunkContents+nextOutputStringStart,
2217                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2218                 } else {
2219                     UText remainingText = UTEXT_INITIALIZER;
2220                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2221                                      fActiveLimit-nextOutputStringStart, &status);
2222                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2223                     utext_close(&remainingText);
2224                 }
2225             } else {
2226                 UErrorCode lengthStatus = U_ZERO_ERROR;
2227                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2228                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2229                 if (remainingChars == NULL) {
2230                     status = U_MEMORY_ALLOCATION_ERROR;
2231                     break;
2232                 }
2233 
2234                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2235                 if (dest[i]) {
2236                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2237                 } else {
2238                     UText remainingText = UTEXT_INITIALIZER;
2239                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2240                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2241                     utext_close(&remainingText);
2242                 }
2243 
2244                 uprv_free(remainingChars);
2245             }
2246             break;
2247         }
2248         if (U_FAILURE(status)) {
2249             break;
2250         }
2251     }   // end of for loop
2252     return i+1;
2253 }
2254 
2255 
2256 //--------------------------------------------------------------------------------
2257 //
2258 //     start
2259 //
2260 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2261 int32_t RegexMatcher::start(UErrorCode &status) const {
2262     return start(0, status);
2263 }
2264 
start64(UErrorCode & status) const2265 int64_t RegexMatcher::start64(UErrorCode &status) const {
2266     return start64(0, status);
2267 }
2268 
2269 //--------------------------------------------------------------------------------
2270 //
2271 //     start(int32_t group, UErrorCode &status)
2272 //
2273 //--------------------------------------------------------------------------------
2274 
start64(int32_t group,UErrorCode & status) const2275 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2276     if (U_FAILURE(status)) {
2277         return -1;
2278     }
2279     if (U_FAILURE(fDeferredStatus)) {
2280         status = fDeferredStatus;
2281         return -1;
2282     }
2283     if (fMatch == FALSE) {
2284         status = U_REGEX_INVALID_STATE;
2285         return -1;
2286     }
2287     if (group < 0 || group > fPattern->fGroupMap->size()) {
2288         status = U_INDEX_OUTOFBOUNDS_ERROR;
2289         return -1;
2290     }
2291     int64_t s;
2292     if (group == 0) {
2293         s = fMatchStart;
2294     } else {
2295         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2296         U_ASSERT(groupOffset < fPattern->fFrameSize);
2297         U_ASSERT(groupOffset >= 0);
2298         s = fFrame->fExtra[groupOffset];
2299     }
2300 
2301     return s;
2302 }
2303 
2304 
start(int32_t group,UErrorCode & status) const2305 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2306     return (int32_t)start64(group, status);
2307 }
2308 
2309 //--------------------------------------------------------------------------------
2310 //
2311 //     useAnchoringBounds
2312 //
2313 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2314 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2315     fAnchoringBounds = b;
2316     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2317     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2318     return *this;
2319 }
2320 
2321 
2322 //--------------------------------------------------------------------------------
2323 //
2324 //     useTransparentBounds
2325 //
2326 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2327 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2328     fTransparentBounds = b;
2329     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2330     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2331     return *this;
2332 }
2333 
2334 //--------------------------------------------------------------------------------
2335 //
2336 //     setTimeLimit
2337 //
2338 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2339 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2340     if (U_FAILURE(status)) {
2341         return;
2342     }
2343     if (U_FAILURE(fDeferredStatus)) {
2344         status = fDeferredStatus;
2345         return;
2346     }
2347     if (limit < 0) {
2348         status = U_ILLEGAL_ARGUMENT_ERROR;
2349         return;
2350     }
2351     fTimeLimit = limit;
2352 }
2353 
2354 
2355 //--------------------------------------------------------------------------------
2356 //
2357 //     getTimeLimit
2358 //
2359 //--------------------------------------------------------------------------------
getTimeLimit() const2360 int32_t RegexMatcher::getTimeLimit() const {
2361     return fTimeLimit;
2362 }
2363 
2364 
2365 //--------------------------------------------------------------------------------
2366 //
2367 //     setStackLimit
2368 //
2369 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2370 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2371     if (U_FAILURE(status)) {
2372         return;
2373     }
2374     if (U_FAILURE(fDeferredStatus)) {
2375         status = fDeferredStatus;
2376         return;
2377     }
2378     if (limit < 0) {
2379         status = U_ILLEGAL_ARGUMENT_ERROR;
2380         return;
2381     }
2382 
2383     // Reset the matcher.  This is needed here in case there is a current match
2384     //    whose final stack frame (containing the match results, pointed to by fFrame)
2385     //    would be lost by resizing to a smaller stack size.
2386     reset();
2387 
2388     if (limit == 0) {
2389         // Unlimited stack expansion
2390         fStack->setMaxCapacity(0);
2391     } else {
2392         // Change the units of the limit  from bytes to ints, and bump the size up
2393         //   to be big enough to hold at least one stack frame for the pattern,
2394         //   if it isn't there already.
2395         int32_t adjustedLimit = limit / sizeof(int32_t);
2396         if (adjustedLimit < fPattern->fFrameSize) {
2397             adjustedLimit = fPattern->fFrameSize;
2398         }
2399         fStack->setMaxCapacity(adjustedLimit);
2400     }
2401     fStackLimit = limit;
2402 }
2403 
2404 
2405 //--------------------------------------------------------------------------------
2406 //
2407 //     getStackLimit
2408 //
2409 //--------------------------------------------------------------------------------
getStackLimit() const2410 int32_t RegexMatcher::getStackLimit() const {
2411     return fStackLimit;
2412 }
2413 
2414 
2415 //--------------------------------------------------------------------------------
2416 //
2417 //     setMatchCallback
2418 //
2419 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2420 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2421                                     const void              *context,
2422                                     UErrorCode              &status) {
2423     if (U_FAILURE(status)) {
2424         return;
2425     }
2426     fCallbackFn = callback;
2427     fCallbackContext = context;
2428 }
2429 
2430 
2431 //--------------------------------------------------------------------------------
2432 //
2433 //     getMatchCallback
2434 //
2435 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2436 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2437                                   const void              *&context,
2438                                   UErrorCode              &status) {
2439     if (U_FAILURE(status)) {
2440        return;
2441     }
2442     callback = fCallbackFn;
2443     context  = fCallbackContext;
2444 }
2445 
2446 
2447 //--------------------------------------------------------------------------------
2448 //
2449 //     setMatchCallback
2450 //
2451 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2452 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2453                                                 const void                      *context,
2454                                                 UErrorCode                      &status) {
2455     if (U_FAILURE(status)) {
2456         return;
2457     }
2458     fFindProgressCallbackFn = callback;
2459     fFindProgressCallbackContext = context;
2460 }
2461 
2462 
2463 //--------------------------------------------------------------------------------
2464 //
2465 //     getMatchCallback
2466 //
2467 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2468 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2469                                                 const void                    *&context,
2470                                                 UErrorCode                    &status) {
2471     if (U_FAILURE(status)) {
2472        return;
2473     }
2474     callback = fFindProgressCallbackFn;
2475     context  = fFindProgressCallbackContext;
2476 }
2477 
2478 
2479 //================================================================================
2480 //
2481 //    Code following this point in this file is the internal
2482 //    Match Engine Implementation.
2483 //
2484 //================================================================================
2485 
2486 
2487 //--------------------------------------------------------------------------------
2488 //
2489 //   resetStack
2490 //           Discard any previous contents of the state save stack, and initialize a
2491 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2492 //           where they indicate that a group has not yet matched anything.
2493 //--------------------------------------------------------------------------------
resetStack()2494 REStackFrame *RegexMatcher::resetStack() {
2495     // Discard any previous contents of the state save stack, and initialize a
2496     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2497     //  where they indicate that a group has not yet matched anything.
2498     fStack->removeAllElements();
2499 
2500     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2501     if(U_FAILURE(fDeferredStatus)) {
2502         return NULL;
2503     }
2504 
2505     int32_t i;
2506     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2507         iFrame->fExtra[i] = -1;
2508     }
2509     return iFrame;
2510 }
2511 
2512 
2513 
2514 //--------------------------------------------------------------------------------
2515 //
2516 //   isWordBoundary
2517 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2518 //                     For us,
2519 //                       If the current char is a combining mark,
2520 //                          \b is FALSE.
2521 //                       Else Scan backwards to the first non-combining char.
2522 //                            We are at a boundary if the this char and the original chars are
2523 //                               opposite in membership in \w set
2524 //
2525 //          parameters:   pos   - the current position in the input buffer
2526 //
2527 //              TODO:  double-check edge cases at region boundaries.
2528 //
2529 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2530 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2531     UBool isBoundary = FALSE;
2532     UBool cIsWord    = FALSE;
2533 
2534     if (pos >= fLookLimit) {
2535         fHitEnd = TRUE;
2536     } else {
2537         // Determine whether char c at current position is a member of the word set of chars.
2538         // If we're off the end of the string, behave as though we're not at a word char.
2539         UTEXT_SETNATIVEINDEX(fInputText, pos);
2540         UChar32  c = UTEXT_CURRENT32(fInputText);
2541         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2542             // Current char is a combining one.  Not a boundary.
2543             return FALSE;
2544         }
2545         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2546     }
2547 
2548     // Back up until we come to a non-combining char, determine whether
2549     //  that char is a word char.
2550     UBool prevCIsWord = FALSE;
2551     for (;;) {
2552         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2553             break;
2554         }
2555         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2556         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2557               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2558             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2559             break;
2560         }
2561     }
2562     isBoundary = cIsWord ^ prevCIsWord;
2563     return isBoundary;
2564 }
2565 
isChunkWordBoundary(int32_t pos)2566 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2567     UBool isBoundary = FALSE;
2568     UBool cIsWord    = FALSE;
2569 
2570     const UChar *inputBuf = fInputText->chunkContents;
2571 
2572     if (pos >= fLookLimit) {
2573         fHitEnd = TRUE;
2574     } else {
2575         // Determine whether char c at current position is a member of the word set of chars.
2576         // If we're off the end of the string, behave as though we're not at a word char.
2577         UChar32 c;
2578         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2579         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2580             // Current char is a combining one.  Not a boundary.
2581             return FALSE;
2582         }
2583         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2584     }
2585 
2586     // Back up until we come to a non-combining char, determine whether
2587     //  that char is a word char.
2588     UBool prevCIsWord = FALSE;
2589     for (;;) {
2590         if (pos <= fLookStart) {
2591             break;
2592         }
2593         UChar32 prevChar;
2594         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2595         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2596               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2597             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2598             break;
2599         }
2600     }
2601     isBoundary = cIsWord ^ prevCIsWord;
2602     return isBoundary;
2603 }
2604 
2605 //--------------------------------------------------------------------------------
2606 //
2607 //   isUWordBoundary
2608 //
2609 //         Test for a word boundary using RBBI word break.
2610 //
2611 //          parameters:   pos   - the current position in the input buffer
2612 //
2613 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2614 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2615     UBool       returnVal = FALSE;
2616 #if UCONFIG_NO_BREAK_ITERATION==0
2617 
2618     // If we haven't yet created a break iterator for this matcher, do it now.
2619     if (fWordBreakItr == NULL) {
2620         fWordBreakItr =
2621             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2622         if (U_FAILURE(fDeferredStatus)) {
2623             return FALSE;
2624         }
2625         fWordBreakItr->setText(fInputText, fDeferredStatus);
2626     }
2627 
2628     if (pos >= fLookLimit) {
2629         fHitEnd = TRUE;
2630         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2631                             //    words are not boundaries.  All non-word chars stand by themselves,
2632                             //    with word boundaries on both sides.
2633     } else {
2634         if (!UTEXT_USES_U16(fInputText)) {
2635             // !!!: Would like a better way to do this!
2636             UErrorCode status = U_ZERO_ERROR;
2637             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2638         }
2639         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2640     }
2641 #endif
2642     return   returnVal;
2643 }
2644 
2645 //--------------------------------------------------------------------------------
2646 //
2647 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2648 //                     saves. Increment the "time" counter, and call the
2649 //                     user callback function if there is one installed.
2650 //
2651 //                     If the match operation needs to be aborted, either for a time-out
2652 //                     or because the user callback asked for it, just set an error status.
2653 //                     The engine will pick that up and stop in its outer loop.
2654 //
2655 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2656 void RegexMatcher::IncrementTime(UErrorCode &status) {
2657     fTickCounter = TIMER_INITIAL_VALUE;
2658     fTime++;
2659     if (fCallbackFn != NULL) {
2660         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2661             status = U_REGEX_STOPPED_BY_CALLER;
2662             return;
2663         }
2664     }
2665     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2666         status = U_REGEX_TIME_OUT;
2667     }
2668 }
2669 
2670 //--------------------------------------------------------------------------------
2671 //
2672 //   StateSave
2673 //       Make a new stack frame, initialized as a copy of the current stack frame.
2674 //       Set the pattern index in the original stack frame from the operand value
2675 //       in the opcode.  Execution of the engine continues with the state in
2676 //       the newly created stack frame
2677 //
2678 //       Note that reserveBlock() may grow the stack, resulting in the
2679 //       whole thing being relocated in memory.
2680 //
2681 //    Parameters:
2682 //       fp           The top frame pointer when called.  At return, a new
2683 //                    fame will be present
2684 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2685 //                    (not new) frame.  If execution ever back-tracks out of the
2686 //                    new frame, this will be where we continue from in the pattern.
2687 //    Return
2688 //                    The new frame pointer.
2689 //
2690 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2691 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2692     if (U_FAILURE(status)) {
2693         return fp;
2694     }
2695     // push storage for a new frame.
2696     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2697     if (U_FAILURE(status)) {
2698         // Failure on attempted stack expansion.
2699         //   Stack function set some other error code, change it to a more
2700         //   specific one for regular expressions.
2701         status = U_REGEX_STACK_OVERFLOW;
2702         // We need to return a writable stack frame, so just return the
2703         //    previous frame.  The match operation will stop quickly
2704         //    because of the error status, after which the frame will never
2705         //    be looked at again.
2706         return fp;
2707     }
2708     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2709 
2710     // New stack frame = copy of old top frame.
2711     int64_t *source = (int64_t *)fp;
2712     int64_t *dest   = newFP;
2713     for (;;) {
2714         *dest++ = *source++;
2715         if (source == newFP) {
2716             break;
2717         }
2718     }
2719 
2720     fTickCounter--;
2721     if (fTickCounter <= 0) {
2722        IncrementTime(status);    // Re-initializes fTickCounter
2723     }
2724     fp->fPatIdx = savePatIdx;
2725     return (REStackFrame *)newFP;
2726 }
2727 
2728 #if defined(REGEX_DEBUG)
2729 namespace {
StringFromUText(UText * ut)2730 UnicodeString StringFromUText(UText *ut) {
2731     UnicodeString result;
2732     for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2733         result.append(c);
2734     }
2735     return result;
2736 }
2737 }
2738 #endif // REGEX_DEBUG
2739 
2740 
2741 //--------------------------------------------------------------------------------
2742 //
2743 //   MatchAt      This is the actual matching engine.
2744 //
2745 //                  startIdx:    begin matching a this index.
2746 //                  toEnd:       if true, match must extend to end of the input region
2747 //
2748 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2749 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2750     UBool       isMatch  = FALSE;      // True if the we have a match.
2751 
2752     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2753 
2754     int32_t     op;                    // Operation from the compiled pattern, split into
2755     int32_t     opType;                //    the opcode
2756     int32_t     opValue;               //    and the operand value.
2757 
2758 #ifdef REGEX_RUN_DEBUG
2759     if (fTraceDebug) {
2760         printf("MatchAt(startIdx=%ld)\n", startIdx);
2761         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2762         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2763     }
2764 #endif
2765 
2766     if (U_FAILURE(status)) {
2767         return;
2768     }
2769 
2770     //  Cache frequently referenced items from the compiled pattern
2771     //
2772     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2773 
2774     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2775     UVector             *fSets         = fPattern->fSets;
2776 
2777     fFrameSize = fPattern->fFrameSize;
2778     REStackFrame        *fp            = resetStack();
2779     if (U_FAILURE(fDeferredStatus)) {
2780         status = fDeferredStatus;
2781         return;
2782     }
2783 
2784     fp->fPatIdx   = 0;
2785     fp->fInputIdx = startIdx;
2786 
2787     // Zero out the pattern's static data
2788     int32_t i;
2789     for (i = 0; i<fPattern->fDataSize; i++) {
2790         fData[i] = 0;
2791     }
2792 
2793     //
2794     //  Main loop for interpreting the compiled pattern.
2795     //  One iteration of the loop per pattern operation performed.
2796     //
2797     for (;;) {
2798         op      = (int32_t)pat[fp->fPatIdx];
2799         opType  = URX_TYPE(op);
2800         opValue = URX_VAL(op);
2801 #ifdef REGEX_RUN_DEBUG
2802         if (fTraceDebug) {
2803             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2804             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2805                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2806             fPattern->dumpOp(fp->fPatIdx);
2807         }
2808 #endif
2809         fp->fPatIdx++;
2810 
2811         switch (opType) {
2812 
2813 
2814         case URX_NOP:
2815             break;
2816 
2817 
2818         case URX_BACKTRACK:
2819             // Force a backtrack.  In some circumstances, the pattern compiler
2820             //   will notice that the pattern can't possibly match anything, and will
2821             //   emit one of these at that point.
2822             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2823             break;
2824 
2825 
2826         case URX_ONECHAR:
2827             if (fp->fInputIdx < fActiveLimit) {
2828                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2829                 UChar32 c = UTEXT_NEXT32(fInputText);
2830                 if (c == opValue) {
2831                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2832                     break;
2833                 }
2834             } else {
2835                 fHitEnd = TRUE;
2836             }
2837             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2838             break;
2839 
2840 
2841         case URX_STRING:
2842             {
2843                 // Test input against a literal string.
2844                 // Strings require two slots in the compiled pattern, one for the
2845                 //   offset to the string text, and one for the length.
2846 
2847                 int32_t   stringStartIdx = opValue;
2848                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2849                 fp->fPatIdx++;
2850                 opType    = URX_TYPE(op);
2851                 int32_t stringLen = URX_VAL(op);
2852                 U_ASSERT(opType == URX_STRING_LEN);
2853                 U_ASSERT(stringLen >= 2);
2854 
2855                 const UChar *patternString = litText+stringStartIdx;
2856                 int32_t patternStringIndex = 0;
2857                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2858                 UChar32 inputChar;
2859                 UChar32 patternChar;
2860                 UBool success = TRUE;
2861                 while (patternStringIndex < stringLen) {
2862                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2863                         success = FALSE;
2864                         fHitEnd = TRUE;
2865                         break;
2866                     }
2867                     inputChar = UTEXT_NEXT32(fInputText);
2868                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2869                     if (patternChar != inputChar) {
2870                         success = FALSE;
2871                         break;
2872                     }
2873                 }
2874 
2875                 if (success) {
2876                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2877                 } else {
2878                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2879                 }
2880             }
2881             break;
2882 
2883 
2884         case URX_STATE_SAVE:
2885             fp = StateSave(fp, opValue, status);
2886             break;
2887 
2888 
2889         case URX_END:
2890             // The match loop will exit via this path on a successful match,
2891             //   when we reach the end of the pattern.
2892             if (toEnd && fp->fInputIdx != fActiveLimit) {
2893                 // The pattern matched, but not to the end of input.  Try some more.
2894                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2895                 break;
2896             }
2897             isMatch = TRUE;
2898             goto  breakFromLoop;
2899 
2900         // Start and End Capture stack frame variables are laid out out like this:
2901             //  fp->fExtra[opValue]  - The start of a completed capture group
2902             //             opValue+1 - The end   of a completed capture group
2903             //             opValue+2 - the start of a capture group whose end
2904             //                          has not yet been reached (and might not ever be).
2905         case URX_START_CAPTURE:
2906             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2907             fp->fExtra[opValue+2] = fp->fInputIdx;
2908             break;
2909 
2910 
2911         case URX_END_CAPTURE:
2912             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2913             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2914             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2915             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2916             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2917             break;
2918 
2919 
2920         case URX_DOLLAR:                   //  $, test for End of line
2921                                            //     or for position before new line at end of input
2922             {
2923                 if (fp->fInputIdx >= fAnchorLimit) {
2924                     // We really are at the end of input.  Success.
2925                     fHitEnd = TRUE;
2926                     fRequireEnd = TRUE;
2927                     break;
2928                 }
2929 
2930                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2931 
2932                 // If we are positioned just before a new-line that is located at the
2933                 //   end of input, succeed.
2934                 UChar32 c = UTEXT_NEXT32(fInputText);
2935                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2936                     if (isLineTerminator(c)) {
2937                         // If not in the middle of a CR/LF sequence
2938                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2939                             // At new-line at end of input. Success
2940                             fHitEnd = TRUE;
2941                             fRequireEnd = TRUE;
2942 
2943                             break;
2944                         }
2945                     }
2946                 } else {
2947                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2948                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2949                         fHitEnd = TRUE;
2950                         fRequireEnd = TRUE;
2951                         break;                         // At CR/LF at end of input.  Success
2952                     }
2953                 }
2954 
2955                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2956             }
2957             break;
2958 
2959 
2960          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2961             if (fp->fInputIdx >= fAnchorLimit) {
2962                 // Off the end of input.  Success.
2963                 fHitEnd = TRUE;
2964                 fRequireEnd = TRUE;
2965                 break;
2966             } else {
2967                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2968                 UChar32 c = UTEXT_NEXT32(fInputText);
2969                 // Either at the last character of input, or off the end.
2970                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2971                     fHitEnd = TRUE;
2972                     fRequireEnd = TRUE;
2973                     break;
2974                 }
2975             }
2976 
2977             // Not at end of input.  Back-track out.
2978             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2979             break;
2980 
2981 
2982          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
2983              {
2984                  if (fp->fInputIdx >= fAnchorLimit) {
2985                      // We really are at the end of input.  Success.
2986                      fHitEnd = TRUE;
2987                      fRequireEnd = TRUE;
2988                      break;
2989                  }
2990                  // If we are positioned just before a new-line, succeed.
2991                  // It makes no difference where the new-line is within the input.
2992                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2993                  UChar32 c = UTEXT_CURRENT32(fInputText);
2994                  if (isLineTerminator(c)) {
2995                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
2996                      //  In multi-line mode, hitting a new-line just before the end of input does not
2997                      //   set the hitEnd or requireEnd flags
2998                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
2999                         break;
3000                      }
3001                  }
3002                  // not at a new line.  Fail.
3003                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3004              }
3005              break;
3006 
3007 
3008          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3009              {
3010                  if (fp->fInputIdx >= fAnchorLimit) {
3011                      // We really are at the end of input.  Success.
3012                      fHitEnd = TRUE;
3013                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3014                      break;               //   adding a new-line would not lose the match.
3015                  }
3016                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3017                  // It makes no difference where the new-line is within the input.
3018                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3020                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3021                  }
3022              }
3023              break;
3024 
3025 
3026        case URX_CARET:                    //  ^, test for start of line
3027             if (fp->fInputIdx != fAnchorStart) {
3028                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3029             }
3030             break;
3031 
3032 
3033        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3034            {
3035                if (fp->fInputIdx == fAnchorStart) {
3036                    // We are at the start input.  Success.
3037                    break;
3038                }
3039                // Check whether character just before the current pos is a new-line
3040                //   unless we are at the end of input
3041                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3042                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3043                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3044                    //  It's a new-line.  ^ is true.  Success.
3045                    //  TODO:  what should be done with positions between a CR and LF?
3046                    break;
3047                }
3048                // Not at the start of a line.  Fail.
3049                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3050            }
3051            break;
3052 
3053 
3054        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3055            {
3056                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3057                if (fp->fInputIdx <= fAnchorStart) {
3058                    // We are at the start input.  Success.
3059                    break;
3060                }
3061                // Check whether character just before the current pos is a new-line
3062                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3063                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3064                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3065                if (c != 0x0a) {
3066                    // Not at the start of a line.  Back-track out.
3067                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3068                }
3069            }
3070            break;
3071 
3072         case URX_BACKSLASH_B:          // Test for word boundaries
3073             {
3074                 UBool success = isWordBoundary(fp->fInputIdx);
3075                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3076                 if (!success) {
3077                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3078                 }
3079             }
3080             break;
3081 
3082 
3083         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3084             {
3085                 UBool success = isUWordBoundary(fp->fInputIdx);
3086                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3087                 if (!success) {
3088                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3089                 }
3090             }
3091             break;
3092 
3093 
3094         case URX_BACKSLASH_D:            // Test for decimal digit
3095             {
3096                 if (fp->fInputIdx >= fActiveLimit) {
3097                     fHitEnd = TRUE;
3098                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3099                     break;
3100                 }
3101 
3102                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3103 
3104                 UChar32 c = UTEXT_NEXT32(fInputText);
3105                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3106                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3107                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3108                 if (success) {
3109                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3110                 } else {
3111                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3112                 }
3113             }
3114             break;
3115 
3116 
3117         case URX_BACKSLASH_G:          // Test for position at end of previous match
3118             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3119                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3120             }
3121             break;
3122 
3123 
3124         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3125             {
3126                 if (fp->fInputIdx >= fActiveLimit) {
3127                     fHitEnd = TRUE;
3128                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129                     break;
3130                 }
3131                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3132                 UChar32 c = UTEXT_NEXT32(fInputText);
3133                 int8_t ctype = u_charType(c);
3134                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3135                 success ^= (UBool)(opValue != 0);        // flip sense for \H
3136                 if (success) {
3137                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3138                 } else {
3139                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3140                 }
3141             }
3142             break;
3143 
3144 
3145         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3146             {
3147                 if (fp->fInputIdx >= fActiveLimit) {
3148                     fHitEnd = TRUE;
3149                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150                     break;
3151                 }
3152                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3153                 UChar32 c = UTEXT_NEXT32(fInputText);
3154                 if (isLineTerminator(c)) {
3155                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3156                         utext_next32(fInputText);
3157                     }
3158                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3159                 } else {
3160                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161                 }
3162             }
3163             break;
3164 
3165 
3166         case URX_BACKSLASH_V:            // \v, any single line ending character.
3167             {
3168                 if (fp->fInputIdx >= fActiveLimit) {
3169                     fHitEnd = TRUE;
3170                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3171                     break;
3172                 }
3173                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3174                 UChar32 c = UTEXT_NEXT32(fInputText);
3175                 UBool success = isLineTerminator(c);
3176                 success ^= (UBool)(opValue != 0);        // flip sense for \V
3177                 if (success) {
3178                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3179                 } else {
3180                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3181                 }
3182             }
3183             break;
3184 
3185 
3186         case URX_BACKSLASH_X:
3187             //  Match a Grapheme, as defined by Unicode TR 29.
3188             //  Differs slightly from Perl, which consumes combining marks independently
3189             //    of context.
3190             {
3191 
3192                 // Fail if at end of input
3193                 if (fp->fInputIdx >= fActiveLimit) {
3194                     fHitEnd = TRUE;
3195                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3196                     break;
3197                 }
3198 
3199                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3200 
3201                 // Examine (and consume) the current char.
3202                 //   Dispatch into a little state machine, based on the char.
3203                 UChar32  c;
3204                 c = UTEXT_NEXT32(fInputText);
3205                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3206                 UnicodeSet **sets = fPattern->fStaticSets;
3207                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3208                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3209                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3210                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3211                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3212                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3213                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3214                 goto GC_Extend;
3215 
3216 
3217 
3218 GC_L:
3219                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3220                 c = UTEXT_NEXT32(fInputText);
3221                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3222                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3223                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3224                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3225                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3226                 (void)UTEXT_PREVIOUS32(fInputText);
3227                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3228                 goto GC_Extend;
3229 
3230 GC_V:
3231                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3232                 c = UTEXT_NEXT32(fInputText);
3233                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3234                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3235                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3236                 (void)UTEXT_PREVIOUS32(fInputText);
3237                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3238                 goto GC_Extend;
3239 
3240 GC_T:
3241                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3242                 c = UTEXT_NEXT32(fInputText);
3243                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3244                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3245                 (void)UTEXT_PREVIOUS32(fInputText);
3246                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247                 goto GC_Extend;
3248 
3249 GC_Extend:
3250                 // Combining characters are consumed here
3251                 for (;;) {
3252                     if (fp->fInputIdx >= fActiveLimit) {
3253                         break;
3254                     }
3255                     c = UTEXT_CURRENT32(fInputText);
3256                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3257                         break;
3258                     }
3259                     (void)UTEXT_NEXT32(fInputText);
3260                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261                 }
3262                 goto GC_Done;
3263 
3264 GC_Control:
3265                 // Most control chars stand alone (don't combine with combining chars),
3266                 //   except for that CR/LF sequence is a single grapheme cluster.
3267                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3268                     c = UTEXT_NEXT32(fInputText);
3269                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270                 }
3271 
3272 GC_Done:
3273                 if (fp->fInputIdx >= fActiveLimit) {
3274                     fHitEnd = TRUE;
3275                 }
3276                 break;
3277             }
3278 
3279 
3280 
3281 
3282         case URX_BACKSLASH_Z:          // Test for end of Input
3283             if (fp->fInputIdx < fAnchorLimit) {
3284                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3285             } else {
3286                 fHitEnd = TRUE;
3287                 fRequireEnd = TRUE;
3288             }
3289             break;
3290 
3291 
3292 
3293         case URX_STATIC_SETREF:
3294             {
3295                 // Test input character against one of the predefined sets
3296                 //    (Word Characters, for example)
3297                 // The high bit of the op value is a flag for the match polarity.
3298                 //    0:   success if input char is in set.
3299                 //    1:   success if input char is not in set.
3300                 if (fp->fInputIdx >= fActiveLimit) {
3301                     fHitEnd = TRUE;
3302                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3303                     break;
3304                 }
3305 
3306                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3307                 opValue &= ~URX_NEG_SET;
3308                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309 
3310                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311                 UChar32 c = UTEXT_NEXT32(fInputText);
3312                 if (c < 256) {
3313                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3314                     if (s8->contains(c)) {
3315                         success = !success;
3316                     }
3317                 } else {
3318                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3319                     if (s->contains(c)) {
3320                         success = !success;
3321                     }
3322                 }
3323                 if (success) {
3324                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3325                 } else {
3326                     // the character wasn't in the set.
3327                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328                 }
3329             }
3330             break;
3331 
3332 
3333         case URX_STAT_SETREF_N:
3334             {
3335                 // Test input character for NOT being a member of  one of
3336                 //    the predefined sets (Word Characters, for example)
3337                 if (fp->fInputIdx >= fActiveLimit) {
3338                     fHitEnd = TRUE;
3339                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3340                     break;
3341                 }
3342 
3343                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3344 
3345                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3346 
3347                 UChar32 c = UTEXT_NEXT32(fInputText);
3348                 if (c < 256) {
3349                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3350                     if (s8->contains(c) == FALSE) {
3351                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3352                         break;
3353                     }
3354                 } else {
3355                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3356                     if (s->contains(c) == FALSE) {
3357                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3358                         break;
3359                     }
3360                 }
3361                 // the character wasn't in the set.
3362                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3363             }
3364             break;
3365 
3366 
3367         case URX_SETREF:
3368             if (fp->fInputIdx >= fActiveLimit) {
3369                 fHitEnd = TRUE;
3370                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371                 break;
3372             } else {
3373                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3374 
3375                 // There is input left.  Pick up one char and test it for set membership.
3376                 UChar32 c = UTEXT_NEXT32(fInputText);
3377                 U_ASSERT(opValue > 0 && opValue < fSets->size());
3378                 if (c<256) {
3379                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3380                     if (s8->contains(c)) {
3381                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3382                         break;
3383                     }
3384                 } else {
3385                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3386                     if (s->contains(c)) {
3387                         // The character is in the set.  A Match.
3388                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3389                         break;
3390                     }
3391                 }
3392 
3393                 // the character wasn't in the set.
3394                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3395             }
3396             break;
3397 
3398 
3399         case URX_DOTANY:
3400             {
3401                 // . matches anything, but stops at end-of-line.
3402                 if (fp->fInputIdx >= fActiveLimit) {
3403                     // At end of input.  Match failed.  Backtrack out.
3404                     fHitEnd = TRUE;
3405                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3406                     break;
3407                 }
3408 
3409                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3410 
3411                 // There is input left.  Advance over one char, unless we've hit end-of-line
3412                 UChar32 c = UTEXT_NEXT32(fInputText);
3413                 if (isLineTerminator(c)) {
3414                     // End of line in normal mode.   . does not match.
3415                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3416                     break;
3417                 }
3418                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3419             }
3420             break;
3421 
3422 
3423         case URX_DOTANY_ALL:
3424             {
3425                 // ., in dot-matches-all (including new lines) mode
3426                 if (fp->fInputIdx >= fActiveLimit) {
3427                     // At end of input.  Match failed.  Backtrack out.
3428                     fHitEnd = TRUE;
3429                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3430                     break;
3431                 }
3432 
3433                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3434 
3435                 // There is input left.  Advance over one char, except if we are
3436                 //   at a cr/lf, advance over both of them.
3437                 UChar32 c;
3438                 c = UTEXT_NEXT32(fInputText);
3439                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3440                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3441                     // In the case of a CR/LF, we need to advance over both.
3442                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3443                     if (nextc == 0x0a) {
3444                         (void)UTEXT_NEXT32(fInputText);
3445                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3446                     }
3447                 }
3448             }
3449             break;
3450 
3451 
3452         case URX_DOTANY_UNIX:
3453             {
3454                 // '.' operator, matches all, but stops at end-of-line.
3455                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3456                 if (fp->fInputIdx >= fActiveLimit) {
3457                     // At end of input.  Match failed.  Backtrack out.
3458                     fHitEnd = TRUE;
3459                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3460                     break;
3461                 }
3462 
3463                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3464 
3465                 // There is input left.  Advance over one char, unless we've hit end-of-line
3466                 UChar32 c = UTEXT_NEXT32(fInputText);
3467                 if (c == 0x0a) {
3468                     // End of line in normal mode.   '.' does not match the \n
3469                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3470                 } else {
3471                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3472                 }
3473             }
3474             break;
3475 
3476 
3477         case URX_JMP:
3478             fp->fPatIdx = opValue;
3479             break;
3480 
3481         case URX_FAIL:
3482             isMatch = FALSE;
3483             goto breakFromLoop;
3484 
3485         case URX_JMP_SAV:
3486             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3487             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3488             fp->fPatIdx = opValue;                         // Then JMP.
3489             break;
3490 
3491         case URX_JMP_SAV_X:
3492             // This opcode is used with (x)+, when x can match a zero length string.
3493             // Same as JMP_SAV, except conditional on the match having made forward progress.
3494             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3495             //   data address of the input position at the start of the loop.
3496             {
3497                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3498                 int32_t  stoOp = (int32_t)pat[opValue-1];
3499                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3500                 int32_t  frameLoc = URX_VAL(stoOp);
3501                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3502                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3503                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3504                 if (prevInputIdx < fp->fInputIdx) {
3505                     // The match did make progress.  Repeat the loop.
3506                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3507                     fp->fPatIdx = opValue;
3508                     fp->fExtra[frameLoc] = fp->fInputIdx;
3509                 }
3510                 // If the input position did not advance, we do nothing here,
3511                 //   execution will fall out of the loop.
3512             }
3513             break;
3514 
3515         case URX_CTR_INIT:
3516             {
3517                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3518                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3519 
3520                 // Pick up the three extra operands that CTR_INIT has, and
3521                 //    skip the pattern location counter past
3522                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3523                 fp->fPatIdx += 3;
3524                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3525                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3526                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3527                 U_ASSERT(minCount>=0);
3528                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3529                 U_ASSERT(loopLoc>=fp->fPatIdx);
3530 
3531                 if (minCount == 0) {
3532                     fp = StateSave(fp, loopLoc+1, status);
3533                 }
3534                 if (maxCount == -1) {
3535                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3536                 } else if (maxCount == 0) {
3537                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3538                 }
3539             }
3540             break;
3541 
3542         case URX_CTR_LOOP:
3543             {
3544                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3545                 int32_t initOp = (int32_t)pat[opValue];
3546                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3547                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3548                 int32_t minCount  = (int32_t)pat[opValue+2];
3549                 int32_t maxCount  = (int32_t)pat[opValue+3];
3550                 (*pCounter)++;
3551                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3552                     U_ASSERT(*pCounter == maxCount);
3553                     break;
3554                 }
3555                 if (*pCounter >= minCount) {
3556                     if (maxCount == -1) {
3557                         // Loop has no hard upper bound.
3558                         // Check that it is progressing through the input, break if it is not.
3559                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3560                         if (fp->fInputIdx == *pLastInputIdx) {
3561                             break;
3562                         } else {
3563                             *pLastInputIdx = fp->fInputIdx;
3564                         }
3565                     }
3566                     fp = StateSave(fp, fp->fPatIdx, status);
3567                 } else {
3568                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3569                     fTickCounter--;
3570                     if (fTickCounter <= 0) {
3571                         IncrementTime(status);    // Re-initializes fTickCounter
3572                     }
3573                 }
3574 
3575                 fp->fPatIdx = opValue + 4;    // Loop back.
3576             }
3577             break;
3578 
3579         case URX_CTR_INIT_NG:
3580             {
3581                 // Initialize a non-greedy loop
3582                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3583                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3584 
3585                 // Pick up the three extra operands that CTR_INIT_NG has, and
3586                 //    skip the pattern location counter past
3587                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3588                 fp->fPatIdx += 3;
3589                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3590                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3591                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3592                 U_ASSERT(minCount>=0);
3593                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3594                 U_ASSERT(loopLoc>fp->fPatIdx);
3595                 if (maxCount == -1) {
3596                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3597                 }
3598 
3599                 if (minCount == 0) {
3600                     if (maxCount != 0) {
3601                         fp = StateSave(fp, fp->fPatIdx, status);
3602                     }
3603                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3604                 }
3605             }
3606             break;
3607 
3608         case URX_CTR_LOOP_NG:
3609             {
3610                 // Non-greedy {min, max} loops
3611                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3612                 int32_t initOp = (int32_t)pat[opValue];
3613                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3614                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3615                 int32_t minCount  = (int32_t)pat[opValue+2];
3616                 int32_t maxCount  = (int32_t)pat[opValue+3];
3617 
3618                 (*pCounter)++;
3619                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3620                     // The loop has matched the maximum permitted number of times.
3621                     //   Break out of here with no action.  Matching will
3622                     //   continue with the following pattern.
3623                     U_ASSERT(*pCounter == maxCount);
3624                     break;
3625                 }
3626 
3627                 if (*pCounter < minCount) {
3628                     // We haven't met the minimum number of matches yet.
3629                     //   Loop back for another one.
3630                     fp->fPatIdx = opValue + 4;    // Loop back.
3631                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3632                     fTickCounter--;
3633                     if (fTickCounter <= 0) {
3634                         IncrementTime(status);    // Re-initializes fTickCounter
3635                     }
3636                 } else {
3637                     // We do have the minimum number of matches.
3638 
3639                     // If there is no upper bound on the loop iterations, check that the input index
3640                     // is progressing, and stop the loop if it is not.
3641                     if (maxCount == -1) {
3642                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3643                         if (fp->fInputIdx == *pLastInputIdx) {
3644                             break;
3645                         }
3646                         *pLastInputIdx = fp->fInputIdx;
3647                     }
3648 
3649                     // Loop Continuation: we will fall into the pattern following the loop
3650                     //   (non-greedy, don't execute loop body first), but first do
3651                     //   a state save to the top of the loop, so that a match failure
3652                     //   in the following pattern will try another iteration of the loop.
3653                     fp = StateSave(fp, opValue + 4, status);
3654                 }
3655             }
3656             break;
3657 
3658         case URX_STO_SP:
3659             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3660             fData[opValue] = fStack->size();
3661             break;
3662 
3663         case URX_LD_SP:
3664             {
3665                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3666                 int32_t newStackSize = (int32_t)fData[opValue];
3667                 U_ASSERT(newStackSize <= fStack->size());
3668                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3669                 if (newFP == (int64_t *)fp) {
3670                     break;
3671                 }
3672                 int32_t j;
3673                 for (j=0; j<fFrameSize; j++) {
3674                     newFP[j] = ((int64_t *)fp)[j];
3675                 }
3676                 fp = (REStackFrame *)newFP;
3677                 fStack->setSize(newStackSize);
3678             }
3679             break;
3680 
3681         case URX_BACKREF:
3682             {
3683                 U_ASSERT(opValue < fFrameSize);
3684                 int64_t groupStartIdx = fp->fExtra[opValue];
3685                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3686                 U_ASSERT(groupStartIdx <= groupEndIdx);
3687                 if (groupStartIdx < 0) {
3688                     // This capture group has not participated in the match thus far,
3689                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3690                     break;
3691                 }
3692                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3693                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3694 
3695                 //   Note: if the capture group match was of an empty string the backref
3696                 //         match succeeds.  Verified by testing:  Perl matches succeed
3697                 //         in this case, so we do too.
3698 
3699                 UBool success = TRUE;
3700                 for (;;) {
3701                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3702                         success = TRUE;
3703                         break;
3704                     }
3705                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3706                         success = FALSE;
3707                         fHitEnd = TRUE;
3708                         break;
3709                     }
3710                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3711                     UChar32 inputChar = utext_next32(fInputText);
3712                     if (inputChar != captureGroupChar) {
3713                         success = FALSE;
3714                         break;
3715                     }
3716                 }
3717 
3718                 if (success) {
3719                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3720                 } else {
3721                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3722                 }
3723             }
3724             break;
3725 
3726 
3727 
3728         case URX_BACKREF_I:
3729             {
3730                 U_ASSERT(opValue < fFrameSize);
3731                 int64_t groupStartIdx = fp->fExtra[opValue];
3732                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3733                 U_ASSERT(groupStartIdx <= groupEndIdx);
3734                 if (groupStartIdx < 0) {
3735                     // This capture group has not participated in the match thus far,
3736                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3737                     break;
3738                 }
3739                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3740                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3741                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3742                 CaseFoldingUTextIterator inputItr(*fInputText);
3743 
3744                 //   Note: if the capture group match was of an empty string the backref
3745                 //         match succeeds.  Verified by testing:  Perl matches succeed
3746                 //         in this case, so we do too.
3747 
3748                 UBool success = TRUE;
3749                 for (;;) {
3750                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3751                         success = TRUE;
3752                         break;
3753                     }
3754                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3755                         success = FALSE;
3756                         fHitEnd = TRUE;
3757                         break;
3758                     }
3759                     UChar32 captureGroupChar = captureGroupItr.next();
3760                     UChar32 inputChar = inputItr.next();
3761                     if (inputChar != captureGroupChar) {
3762                         success = FALSE;
3763                         break;
3764                     }
3765                 }
3766 
3767                 if (success && inputItr.inExpansion()) {
3768                     // We otained a match by consuming part of a string obtained from
3769                     // case-folding a single code point of the input text.
3770                     // This does not count as an overall match.
3771                     success = FALSE;
3772                 }
3773 
3774                 if (success) {
3775                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3776                 } else {
3777                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3778                 }
3779 
3780             }
3781             break;
3782 
3783         case URX_STO_INP_LOC:
3784             {
3785                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3786                 fp->fExtra[opValue] = fp->fInputIdx;
3787             }
3788             break;
3789 
3790         case URX_JMPX:
3791             {
3792                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3793                 fp->fPatIdx += 1;
3794                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3795                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3796                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3797                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3798                 if (savedInputIdx < fp->fInputIdx) {
3799                     fp->fPatIdx = opValue;                               // JMP
3800                 } else {
3801                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3802                 }
3803             }
3804             break;
3805 
3806         case URX_LA_START:
3807             {
3808                 // Entering a look around block.
3809                 // Save Stack Ptr, Input Pos.
3810                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3811                 fData[opValue]   = fStack->size();
3812                 fData[opValue+1] = fp->fInputIdx;
3813                 fData[opValue+2] = fActiveStart;
3814                 fData[opValue+3] = fActiveLimit;
3815                 fActiveStart     = fLookStart;          // Set the match region change for
3816                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3817             }
3818             break;
3819 
3820         case URX_LA_END:
3821             {
3822                 // Leaving a look-ahead block.
3823                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3824                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3825                 int32_t stackSize = fStack->size();
3826                 int32_t newStackSize =(int32_t)fData[opValue];
3827                 U_ASSERT(stackSize >= newStackSize);
3828                 if (stackSize > newStackSize) {
3829                     // Copy the current top frame back to the new (cut back) top frame.
3830                     //   This makes the capture groups from within the look-ahead
3831                     //   expression available.
3832                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3833                     int32_t j;
3834                     for (j=0; j<fFrameSize; j++) {
3835                         newFP[j] = ((int64_t *)fp)[j];
3836                     }
3837                     fp = (REStackFrame *)newFP;
3838                     fStack->setSize(newStackSize);
3839                 }
3840                 fp->fInputIdx = fData[opValue+1];
3841 
3842                 // Restore the active region bounds in the input string; they may have
3843                 //    been changed because of transparent bounds on a Region.
3844                 fActiveStart = fData[opValue+2];
3845                 fActiveLimit = fData[opValue+3];
3846                 U_ASSERT(fActiveStart >= 0);
3847                 U_ASSERT(fActiveLimit <= fInputLength);
3848             }
3849             break;
3850 
3851         case URX_ONECHAR_I:
3852             // Case insensitive one char.  The char from the pattern is already case folded.
3853             // Input text is not, but case folding the input can not reduce two or more code
3854             // points to one.
3855             if (fp->fInputIdx < fActiveLimit) {
3856                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3857 
3858                 UChar32 c = UTEXT_NEXT32(fInputText);
3859                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3860                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3861                     break;
3862                 }
3863             } else {
3864                 fHitEnd = TRUE;
3865             }
3866 
3867             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3868             break;
3869 
3870         case URX_STRING_I:
3871             {
3872                 // Case-insensitive test input against a literal string.
3873                 // Strings require two slots in the compiled pattern, one for the
3874                 //   offset to the string text, and one for the length.
3875                 //   The compiled string has already been case folded.
3876                 {
3877                     const UChar *patternString = litText + opValue;
3878                     int32_t      patternStringIdx  = 0;
3879 
3880                     op      = (int32_t)pat[fp->fPatIdx];
3881                     fp->fPatIdx++;
3882                     opType  = URX_TYPE(op);
3883                     opValue = URX_VAL(op);
3884                     U_ASSERT(opType == URX_STRING_LEN);
3885                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3886 
3887 
3888                     UChar32   cPattern;
3889                     UChar32   cText;
3890                     UBool     success = TRUE;
3891 
3892                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3893                     CaseFoldingUTextIterator inputIterator(*fInputText);
3894                     while (patternStringIdx < patternStringLen) {
3895                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3896                             success = FALSE;
3897                             fHitEnd = TRUE;
3898                             break;
3899                         }
3900                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3901                         cText = inputIterator.next();
3902                         if (cText != cPattern) {
3903                             success = FALSE;
3904                             break;
3905                         }
3906                     }
3907                     if (inputIterator.inExpansion()) {
3908                         success = FALSE;
3909                     }
3910 
3911                     if (success) {
3912                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3913                     } else {
3914                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3915                     }
3916                 }
3917             }
3918             break;
3919 
3920         case URX_LB_START:
3921             {
3922                 // Entering a look-behind block.
3923                 // Save Stack Ptr, Input Pos and active input region.
3924                 //   TODO:  implement transparent bounds.  Ticket #6067
3925                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3926                 fData[opValue]   = fStack->size();
3927                 fData[opValue+1] = fp->fInputIdx;
3928                 // Save input string length, then reset to pin any matches to end at
3929                 //   the current position.
3930                 fData[opValue+2] = fActiveStart;
3931                 fData[opValue+3] = fActiveLimit;
3932                 fActiveStart     = fRegionStart;
3933                 fActiveLimit     = fp->fInputIdx;
3934                 // Init the variable containing the start index for attempted matches.
3935                 fData[opValue+4] = -1;
3936             }
3937             break;
3938 
3939 
3940         case URX_LB_CONT:
3941             {
3942                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3943                 //    at all possible input starting positions.
3944 
3945                 // Fetch the min and max possible match lengths.  They are the operands
3946                 //   of this op in the pattern.
3947                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3948                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3949                 if (!UTEXT_USES_U16(fInputText)) {
3950                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3951                     // The max length need not be exact; it just needs to be >= actual maximum.
3952                     maxML *= 3;
3953                 }
3954                 U_ASSERT(minML <= maxML);
3955                 U_ASSERT(minML >= 0);
3956 
3957                 // Fetch (from data) the last input index where a match was attempted.
3958                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3959                 int64_t  &lbStartIdx = fData[opValue+4];
3960                 if (lbStartIdx < 0) {
3961                     // First time through loop.
3962                     lbStartIdx = fp->fInputIdx - minML;
3963                     if (lbStartIdx > 0) {
3964                         // move index to a code point boudary, if it's not on one already.
3965                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3966                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3967                     }
3968                 } else {
3969                     // 2nd through nth time through the loop.
3970                     // Back up start position for match by one.
3971                     if (lbStartIdx == 0) {
3972                         (lbStartIdx)--;
3973                     } else {
3974                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3975                         (void)UTEXT_PREVIOUS32(fInputText);
3976                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3977                     }
3978                 }
3979 
3980                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3981                     // We have tried all potential match starting points without
3982                     //  getting a match.  Backtrack out, and out of the
3983                     //   Look Behind altogether.
3984                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3985                     fActiveStart = fData[opValue+2];
3986                     fActiveLimit = fData[opValue+3];
3987                     U_ASSERT(fActiveStart >= 0);
3988                     U_ASSERT(fActiveLimit <= fInputLength);
3989                     break;
3990                 }
3991 
3992                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3993                 //      (successful match will fall off the end of the loop.)
3994                 fp = StateSave(fp, fp->fPatIdx-3, status);
3995                 fp->fInputIdx = lbStartIdx;
3996             }
3997             break;
3998 
3999         case URX_LB_END:
4000             // End of a look-behind block, after a successful match.
4001             {
4002                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4003                 if (fp->fInputIdx != fActiveLimit) {
4004                     //  The look-behind expression matched, but the match did not
4005                     //    extend all the way to the point that we are looking behind from.
4006                     //  FAIL out of here, which will take us back to the LB_CONT, which
4007                     //     will retry the match starting at another position or fail
4008                     //     the look-behind altogether, whichever is appropriate.
4009                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4010                     break;
4011                 }
4012 
4013                 // Look-behind match is good.  Restore the orignal input string region,
4014                 //   which had been truncated to pin the end of the lookbehind match to the
4015                 //   position being looked-behind.
4016                 fActiveStart = fData[opValue+2];
4017                 fActiveLimit = fData[opValue+3];
4018                 U_ASSERT(fActiveStart >= 0);
4019                 U_ASSERT(fActiveLimit <= fInputLength);
4020             }
4021             break;
4022 
4023 
4024         case URX_LBN_CONT:
4025             {
4026                 // Negative Look-Behind, at top of loop checking for matches of LB expression
4027                 //    at all possible input starting positions.
4028 
4029                 // Fetch the extra parameters of this op.
4030                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4031                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4032                 if (!UTEXT_USES_U16(fInputText)) {
4033                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4034                     // The max length need not be exact; it just needs to be >= actual maximum.
4035                     maxML *= 3;
4036                 }
4037                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4038                         continueLoc = URX_VAL(continueLoc);
4039                 U_ASSERT(minML <= maxML);
4040                 U_ASSERT(minML >= 0);
4041                 U_ASSERT(continueLoc > fp->fPatIdx);
4042 
4043                 // Fetch (from data) the last input index where a match was attempted.
4044                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4045                 int64_t  &lbStartIdx = fData[opValue+4];
4046                 if (lbStartIdx < 0) {
4047                     // First time through loop.
4048                     lbStartIdx = fp->fInputIdx - minML;
4049                     if (lbStartIdx > 0) {
4050                         // move index to a code point boudary, if it's not on one already.
4051                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4052                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4053                     }
4054                 } else {
4055                     // 2nd through nth time through the loop.
4056                     // Back up start position for match by one.
4057                     if (lbStartIdx == 0) {
4058                         (lbStartIdx)--;
4059                     } else {
4060                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4061                         (void)UTEXT_PREVIOUS32(fInputText);
4062                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4063                     }
4064                 }
4065 
4066                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4067                     // We have tried all potential match starting points without
4068                     //  getting a match, which means that the negative lookbehind as
4069                     //  a whole has succeeded.  Jump forward to the continue location
4070                     fActiveStart = fData[opValue+2];
4071                     fActiveLimit = fData[opValue+3];
4072                     U_ASSERT(fActiveStart >= 0);
4073                     U_ASSERT(fActiveLimit <= fInputLength);
4074                     fp->fPatIdx = continueLoc;
4075                     break;
4076                 }
4077 
4078                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4079                 //      (successful match will cause a FAIL out of the loop altogether.)
4080                 fp = StateSave(fp, fp->fPatIdx-4, status);
4081                 fp->fInputIdx = lbStartIdx;
4082             }
4083             break;
4084 
4085         case URX_LBN_END:
4086             // End of a negative look-behind block, after a successful match.
4087             {
4088                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4089                 if (fp->fInputIdx != fActiveLimit) {
4090                     //  The look-behind expression matched, but the match did not
4091                     //    extend all the way to the point that we are looking behind from.
4092                     //  FAIL out of here, which will take us back to the LB_CONT, which
4093                     //     will retry the match starting at another position or succeed
4094                     //     the look-behind altogether, whichever is appropriate.
4095                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4096                     break;
4097                 }
4098 
4099                 // Look-behind expression matched, which means look-behind test as
4100                 //   a whole Fails
4101 
4102                 //   Restore the orignal input string length, which had been truncated
4103                 //   inorder to pin the end of the lookbehind match
4104                 //   to the position being looked-behind.
4105                 fActiveStart = fData[opValue+2];
4106                 fActiveLimit = fData[opValue+3];
4107                 U_ASSERT(fActiveStart >= 0);
4108                 U_ASSERT(fActiveLimit <= fInputLength);
4109 
4110                 // Restore original stack position, discarding any state saved
4111                 //   by the successful pattern match.
4112                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4113                 int32_t newStackSize = (int32_t)fData[opValue];
4114                 U_ASSERT(fStack->size() > newStackSize);
4115                 fStack->setSize(newStackSize);
4116 
4117                 //  FAIL, which will take control back to someplace
4118                 //  prior to entering the look-behind test.
4119                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4120             }
4121             break;
4122 
4123 
4124         case URX_LOOP_SR_I:
4125             // Loop Initialization for the optimized implementation of
4126             //     [some character set]*
4127             //   This op scans through all matching input.
4128             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4129             {
4130                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4131                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4132                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
4133 
4134                 // Loop through input, until either the input is exhausted or
4135                 //   we reach a character that is not a member of the set.
4136                 int64_t ix = fp->fInputIdx;
4137                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4138                 for (;;) {
4139                     if (ix >= fActiveLimit) {
4140                         fHitEnd = TRUE;
4141                         break;
4142                     }
4143                     UChar32 c = UTEXT_NEXT32(fInputText);
4144                     if (c<256) {
4145                         if (s8->contains(c) == FALSE) {
4146                             break;
4147                         }
4148                     } else {
4149                         if (s->contains(c) == FALSE) {
4150                             break;
4151                         }
4152                     }
4153                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4154                 }
4155 
4156                 // If there were no matching characters, skip over the loop altogether.
4157                 //   The loop doesn't run at all, a * op always succeeds.
4158                 if (ix == fp->fInputIdx) {
4159                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4160                     break;
4161                 }
4162 
4163                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4164                 //   must follow.  It's operand is the stack location
4165                 //   that holds the starting input index for the match of this [set]*
4166                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4167                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4168                 int32_t stackLoc = URX_VAL(loopcOp);
4169                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4170                 fp->fExtra[stackLoc] = fp->fInputIdx;
4171                 fp->fInputIdx = ix;
4172 
4173                 // Save State to the URX_LOOP_C op that follows this one,
4174                 //   so that match failures in the following code will return to there.
4175                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4176                 fp = StateSave(fp, fp->fPatIdx, status);
4177                 fp->fPatIdx++;
4178             }
4179             break;
4180 
4181 
4182         case URX_LOOP_DOT_I:
4183             // Loop Initialization for the optimized implementation of .*
4184             //   This op scans through all remaining input.
4185             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4186             {
4187                 // Loop through input until the input is exhausted (we reach an end-of-line)
4188                 // In DOTALL mode, we can just go straight to the end of the input.
4189                 int64_t ix;
4190                 if ((opValue & 1) == 1) {
4191                     // Dot-matches-All mode.  Jump straight to the end of the string.
4192                     ix = fActiveLimit;
4193                     fHitEnd = TRUE;
4194                 } else {
4195                     // NOT DOT ALL mode.  Line endings do not match '.'
4196                     // Scan forward until a line ending or end of input.
4197                     ix = fp->fInputIdx;
4198                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4199                     for (;;) {
4200                         if (ix >= fActiveLimit) {
4201                             fHitEnd = TRUE;
4202                             break;
4203                         }
4204                         UChar32 c = UTEXT_NEXT32(fInputText);
4205                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4206                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4207                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4208                                     isLineTerminator(c))) {
4209                                 //  char is a line ending.  Exit the scanning loop.
4210                                 break;
4211                             }
4212                         }
4213                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4214                     }
4215                 }
4216 
4217                 // If there were no matching characters, skip over the loop altogether.
4218                 //   The loop doesn't run at all, a * op always succeeds.
4219                 if (ix == fp->fInputIdx) {
4220                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4221                     break;
4222                 }
4223 
4224                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4225                 //   must follow.  It's operand is the stack location
4226                 //   that holds the starting input index for the match of this .*
4227                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4228                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4229                 int32_t stackLoc = URX_VAL(loopcOp);
4230                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4231                 fp->fExtra[stackLoc] = fp->fInputIdx;
4232                 fp->fInputIdx = ix;
4233 
4234                 // Save State to the URX_LOOP_C op that follows this one,
4235                 //   so that match failures in the following code will return to there.
4236                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4237                 fp = StateSave(fp, fp->fPatIdx, status);
4238                 fp->fPatIdx++;
4239             }
4240             break;
4241 
4242 
4243         case URX_LOOP_C:
4244             {
4245                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4246                 backSearchIndex = fp->fExtra[opValue];
4247                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4248                 if (backSearchIndex == fp->fInputIdx) {
4249                     // We've backed up the input idx to the point that the loop started.
4250                     // The loop is done.  Leave here without saving state.
4251                     //  Subsequent failures won't come back here.
4252                     break;
4253                 }
4254                 // Set up for the next iteration of the loop, with input index
4255                 //   backed up by one from the last time through,
4256                 //   and a state save to this instruction in case the following code fails again.
4257                 //   (We're going backwards because this loop emulates stack unwinding, not
4258                 //    the initial scan forward.)
4259                 U_ASSERT(fp->fInputIdx > 0);
4260                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4261                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4262                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4263 
4264                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4265                 if (prevC == 0x0a &&
4266                     fp->fInputIdx > backSearchIndex &&
4267                     twoPrevC == 0x0d) {
4268                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4269                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4270                         // .*, stepping back over CRLF pair.
4271                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4272                     }
4273                 }
4274 
4275 
4276                 fp = StateSave(fp, fp->fPatIdx-1, status);
4277             }
4278             break;
4279 
4280 
4281 
4282         default:
4283             // Trouble.  The compiled pattern contains an entry with an
4284             //           unrecognized type tag.
4285             UPRV_UNREACHABLE;
4286         }
4287 
4288         if (U_FAILURE(status)) {
4289             isMatch = FALSE;
4290             break;
4291         }
4292     }
4293 
4294 breakFromLoop:
4295     fMatch = isMatch;
4296     if (isMatch) {
4297         fLastMatchEnd = fMatchEnd;
4298         fMatchStart   = startIdx;
4299         fMatchEnd     = fp->fInputIdx;
4300     }
4301 
4302 #ifdef REGEX_RUN_DEBUG
4303     if (fTraceDebug) {
4304         if (isMatch) {
4305             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4306         } else {
4307             printf("No match\n\n");
4308         }
4309     }
4310 #endif
4311 
4312     fFrame = fp;                // The active stack frame when the engine stopped.
4313                                 //   Contains the capture group results that we need to
4314                                 //    access later.
4315     return;
4316 }
4317 
4318 
4319 //--------------------------------------------------------------------------------
4320 //
4321 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4322 //                  assumption that the entire string is available in the UText's
4323 //                  chunk buffer. For now, that means we can use int32_t indexes,
4324 //                  except for anything that needs to be saved (like group starts
4325 //                  and ends).
4326 //
4327 //                  startIdx:    begin matching a this index.
4328 //                  toEnd:       if true, match must extend to end of the input region
4329 //
4330 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4331 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4332     UBool       isMatch  = FALSE;      // True if the we have a match.
4333 
4334     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4335 
4336     int32_t     op;                    // Operation from the compiled pattern, split into
4337     int32_t     opType;                //    the opcode
4338     int32_t     opValue;               //    and the operand value.
4339 
4340 #ifdef REGEX_RUN_DEBUG
4341     if (fTraceDebug) {
4342         printf("MatchAt(startIdx=%d)\n", startIdx);
4343         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4344         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4345     }
4346 #endif
4347 
4348     if (U_FAILURE(status)) {
4349         return;
4350     }
4351 
4352     //  Cache frequently referenced items from the compiled pattern
4353     //
4354     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4355 
4356     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4357     UVector             *fSets         = fPattern->fSets;
4358 
4359     const UChar         *inputBuf      = fInputText->chunkContents;
4360 
4361     fFrameSize = fPattern->fFrameSize;
4362     REStackFrame        *fp            = resetStack();
4363     if (U_FAILURE(fDeferredStatus)) {
4364         status = fDeferredStatus;
4365         return;
4366     }
4367 
4368     fp->fPatIdx   = 0;
4369     fp->fInputIdx = startIdx;
4370 
4371     // Zero out the pattern's static data
4372     int32_t i;
4373     for (i = 0; i<fPattern->fDataSize; i++) {
4374         fData[i] = 0;
4375     }
4376 
4377     //
4378     //  Main loop for interpreting the compiled pattern.
4379     //  One iteration of the loop per pattern operation performed.
4380     //
4381     for (;;) {
4382         op      = (int32_t)pat[fp->fPatIdx];
4383         opType  = URX_TYPE(op);
4384         opValue = URX_VAL(op);
4385 #ifdef REGEX_RUN_DEBUG
4386         if (fTraceDebug) {
4387             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4389                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4390             fPattern->dumpOp(fp->fPatIdx);
4391         }
4392 #endif
4393         fp->fPatIdx++;
4394 
4395         switch (opType) {
4396 
4397 
4398         case URX_NOP:
4399             break;
4400 
4401 
4402         case URX_BACKTRACK:
4403             // Force a backtrack.  In some circumstances, the pattern compiler
4404             //   will notice that the pattern can't possibly match anything, and will
4405             //   emit one of these at that point.
4406             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4407             break;
4408 
4409 
4410         case URX_ONECHAR:
4411             if (fp->fInputIdx < fActiveLimit) {
4412                 UChar32 c;
4413                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4414                 if (c == opValue) {
4415                     break;
4416                 }
4417             } else {
4418                 fHitEnd = TRUE;
4419             }
4420             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4421             break;
4422 
4423 
4424         case URX_STRING:
4425             {
4426                 // Test input against a literal string.
4427                 // Strings require two slots in the compiled pattern, one for the
4428                 //   offset to the string text, and one for the length.
4429                 int32_t   stringStartIdx = opValue;
4430                 int32_t   stringLen;
4431 
4432                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4433                 fp->fPatIdx++;
4434                 opType    = URX_TYPE(op);
4435                 stringLen = URX_VAL(op);
4436                 U_ASSERT(opType == URX_STRING_LEN);
4437                 U_ASSERT(stringLen >= 2);
4438 
4439                 const UChar * pInp = inputBuf + fp->fInputIdx;
4440                 const UChar * pInpLimit = inputBuf + fActiveLimit;
4441                 const UChar * pPat = litText+stringStartIdx;
4442                 const UChar * pEnd = pInp + stringLen;
4443                 UBool success = TRUE;
4444                 while (pInp < pEnd) {
4445                     if (pInp >= pInpLimit) {
4446                         fHitEnd = TRUE;
4447                         success = FALSE;
4448                         break;
4449                     }
4450                     if (*pInp++ != *pPat++) {
4451                         success = FALSE;
4452                         break;
4453                     }
4454                 }
4455 
4456                 if (success) {
4457                     fp->fInputIdx += stringLen;
4458                 } else {
4459                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4460                 }
4461             }
4462             break;
4463 
4464 
4465         case URX_STATE_SAVE:
4466             fp = StateSave(fp, opValue, status);
4467             break;
4468 
4469 
4470         case URX_END:
4471             // The match loop will exit via this path on a successful match,
4472             //   when we reach the end of the pattern.
4473             if (toEnd && fp->fInputIdx != fActiveLimit) {
4474                 // The pattern matched, but not to the end of input.  Try some more.
4475                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4476                 break;
4477             }
4478             isMatch = TRUE;
4479             goto  breakFromLoop;
4480 
4481             // Start and End Capture stack frame variables are laid out out like this:
4482             //  fp->fExtra[opValue]  - The start of a completed capture group
4483             //             opValue+1 - The end   of a completed capture group
4484             //             opValue+2 - the start of a capture group whose end
4485             //                          has not yet been reached (and might not ever be).
4486         case URX_START_CAPTURE:
4487             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4488             fp->fExtra[opValue+2] = fp->fInputIdx;
4489             break;
4490 
4491 
4492         case URX_END_CAPTURE:
4493             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4494             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4495             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4496             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4497             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4498             break;
4499 
4500 
4501         case URX_DOLLAR:                   //  $, test for End of line
4502             //     or for position before new line at end of input
4503             if (fp->fInputIdx < fAnchorLimit-2) {
4504                 // We are no where near the end of input.  Fail.
4505                 //   This is the common case.  Keep it first.
4506                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4507                 break;
4508             }
4509             if (fp->fInputIdx >= fAnchorLimit) {
4510                 // We really are at the end of input.  Success.
4511                 fHitEnd = TRUE;
4512                 fRequireEnd = TRUE;
4513                 break;
4514             }
4515 
4516             // If we are positioned just before a new-line that is located at the
4517             //   end of input, succeed.
4518             if (fp->fInputIdx == fAnchorLimit-1) {
4519                 UChar32 c;
4520                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4521 
4522                 if (isLineTerminator(c)) {
4523                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4524                         // At new-line at end of input. Success
4525                         fHitEnd = TRUE;
4526                         fRequireEnd = TRUE;
4527                         break;
4528                     }
4529                 }
4530             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4531                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4532                     fHitEnd = TRUE;
4533                     fRequireEnd = TRUE;
4534                     break;                         // At CR/LF at end of input.  Success
4535             }
4536 
4537             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538 
4539             break;
4540 
4541 
4542         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4543             if (fp->fInputIdx >= fAnchorLimit-1) {
4544                 // Either at the last character of input, or off the end.
4545                 if (fp->fInputIdx == fAnchorLimit-1) {
4546                     // At last char of input.  Success if it's a new line.
4547                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4548                         fHitEnd = TRUE;
4549                         fRequireEnd = TRUE;
4550                         break;
4551                     }
4552                 } else {
4553                     // Off the end of input.  Success.
4554                     fHitEnd = TRUE;
4555                     fRequireEnd = TRUE;
4556                     break;
4557                 }
4558             }
4559 
4560             // Not at end of input.  Back-track out.
4561             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4562             break;
4563 
4564 
4565         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4566             {
4567                 if (fp->fInputIdx >= fAnchorLimit) {
4568                     // We really are at the end of input.  Success.
4569                     fHitEnd = TRUE;
4570                     fRequireEnd = TRUE;
4571                     break;
4572                 }
4573                 // If we are positioned just before a new-line, succeed.
4574                 // It makes no difference where the new-line is within the input.
4575                 UChar32 c = inputBuf[fp->fInputIdx];
4576                 if (isLineTerminator(c)) {
4577                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4578                     //  In multi-line mode, hitting a new-line just before the end of input does not
4579                     //   set the hitEnd or requireEnd flags
4580                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4581                         break;
4582                     }
4583                 }
4584                 // not at a new line.  Fail.
4585                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4586             }
4587             break;
4588 
4589 
4590         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4591             {
4592                 if (fp->fInputIdx >= fAnchorLimit) {
4593                     // We really are at the end of input.  Success.
4594                     fHitEnd = TRUE;
4595                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4596                     break;               //   adding a new-line would not lose the match.
4597                 }
4598                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4599                 // It makes no difference where the new-line is within the input.
4600                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4601                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4602                 }
4603             }
4604             break;
4605 
4606 
4607         case URX_CARET:                    //  ^, test for start of line
4608             if (fp->fInputIdx != fAnchorStart) {
4609                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4610             }
4611             break;
4612 
4613 
4614         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4615             {
4616                 if (fp->fInputIdx == fAnchorStart) {
4617                     // We are at the start input.  Success.
4618                     break;
4619                 }
4620                 // Check whether character just before the current pos is a new-line
4621                 //   unless we are at the end of input
4622                 UChar  c = inputBuf[fp->fInputIdx - 1];
4623                 if ((fp->fInputIdx < fAnchorLimit) &&
4624                     isLineTerminator(c)) {
4625                     //  It's a new-line.  ^ is true.  Success.
4626                     //  TODO:  what should be done with positions between a CR and LF?
4627                     break;
4628                 }
4629                 // Not at the start of a line.  Fail.
4630                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4631             }
4632             break;
4633 
4634 
4635         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4636             {
4637                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4638                 if (fp->fInputIdx <= fAnchorStart) {
4639                     // We are at the start input.  Success.
4640                     break;
4641                 }
4642                 // Check whether character just before the current pos is a new-line
4643                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4644                 UChar  c = inputBuf[fp->fInputIdx - 1];
4645                 if (c != 0x0a) {
4646                     // Not at the start of a line.  Back-track out.
4647                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648                 }
4649             }
4650             break;
4651 
4652         case URX_BACKSLASH_B:          // Test for word boundaries
4653             {
4654                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4655                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4656                 if (!success) {
4657                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658                 }
4659             }
4660             break;
4661 
4662 
4663         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4664             {
4665                 UBool success = isUWordBoundary(fp->fInputIdx);
4666                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4667                 if (!success) {
4668                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669                 }
4670             }
4671             break;
4672 
4673 
4674         case URX_BACKSLASH_D:            // Test for decimal digit
4675             {
4676                 if (fp->fInputIdx >= fActiveLimit) {
4677                     fHitEnd = TRUE;
4678                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4679                     break;
4680                 }
4681 
4682                 UChar32 c;
4683                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4684                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4685                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4686                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4687                 if (!success) {
4688                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4689                 }
4690             }
4691             break;
4692 
4693 
4694         case URX_BACKSLASH_G:          // Test for position at end of previous match
4695             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4696                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4697             }
4698             break;
4699 
4700 
4701         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4702             {
4703                 if (fp->fInputIdx >= fActiveLimit) {
4704                     fHitEnd = TRUE;
4705                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706                     break;
4707                 }
4708                 UChar32 c;
4709                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4710                 int8_t ctype = u_charType(c);
4711                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4712                 success ^= (UBool)(opValue != 0);        // flip sense for \H
4713                 if (!success) {
4714                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4715                 }
4716             }
4717             break;
4718 
4719 
4720         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4721             {
4722                 if (fp->fInputIdx >= fActiveLimit) {
4723                     fHitEnd = TRUE;
4724                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4725                     break;
4726                 }
4727                 UChar32 c;
4728                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4729                 if (isLineTerminator(c)) {
4730                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4731                         // Check for CR/LF sequence. Consume both together when found.
4732                         UChar c2;
4733                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4734                         if (c2 != 0x0a) {
4735                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4736                         }
4737                     }
4738                 } else {
4739                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4740                 }
4741             }
4742             break;
4743 
4744 
4745         case URX_BACKSLASH_V:         // Any single code point line ending.
4746             {
4747                 if (fp->fInputIdx >= fActiveLimit) {
4748                     fHitEnd = TRUE;
4749                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4750                     break;
4751                 }
4752                 UChar32 c;
4753                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4754                 UBool success = isLineTerminator(c);
4755                 success ^= (UBool)(opValue != 0);        // flip sense for \V
4756                 if (!success) {
4757                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758                 }
4759             }
4760             break;
4761 
4762 
4763 
4764         case URX_BACKSLASH_X:
4765         //  Match a Grapheme, as defined by Unicode TR 29.
4766         //  Differs slightly from Perl, which consumes combining marks independently
4767         //    of context.
4768         {
4769 
4770             // Fail if at end of input
4771             if (fp->fInputIdx >= fActiveLimit) {
4772                 fHitEnd = TRUE;
4773                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4774                 break;
4775             }
4776 
4777             // Examine (and consume) the current char.
4778             //   Dispatch into a little state machine, based on the char.
4779             UChar32  c;
4780             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4781             UnicodeSet **sets = fPattern->fStaticSets;
4782             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4783             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4784             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4785             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4786             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4787             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4788             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4789             goto GC_Extend;
4790 
4791 
4792 
4793 GC_L:
4794             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4795             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4796             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4797             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4798             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4799             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4800             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4801             goto GC_Extend;
4802 
4803 GC_V:
4804             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4805             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4806             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4807             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4808             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4809             goto GC_Extend;
4810 
4811 GC_T:
4812             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4813             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4814             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4815             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4816             goto GC_Extend;
4817 
4818 GC_Extend:
4819             // Combining characters are consumed here
4820             for (;;) {
4821                 if (fp->fInputIdx >= fActiveLimit) {
4822                     break;
4823                 }
4824                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4825                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4826                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4827                     break;
4828                 }
4829             }
4830             goto GC_Done;
4831 
4832 GC_Control:
4833             // Most control chars stand alone (don't combine with combining chars),
4834             //   except for that CR/LF sequence is a single grapheme cluster.
4835             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4836                 fp->fInputIdx++;
4837             }
4838 
4839 GC_Done:
4840             if (fp->fInputIdx >= fActiveLimit) {
4841                 fHitEnd = TRUE;
4842             }
4843             break;
4844         }
4845 
4846 
4847 
4848 
4849         case URX_BACKSLASH_Z:          // Test for end of Input
4850             if (fp->fInputIdx < fAnchorLimit) {
4851                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4852             } else {
4853                 fHitEnd = TRUE;
4854                 fRequireEnd = TRUE;
4855             }
4856             break;
4857 
4858 
4859 
4860         case URX_STATIC_SETREF:
4861             {
4862                 // Test input character against one of the predefined sets
4863                 //    (Word Characters, for example)
4864                 // The high bit of the op value is a flag for the match polarity.
4865                 //    0:   success if input char is in set.
4866                 //    1:   success if input char is not in set.
4867                 if (fp->fInputIdx >= fActiveLimit) {
4868                     fHitEnd = TRUE;
4869                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4870                     break;
4871                 }
4872 
4873                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4874                 opValue &= ~URX_NEG_SET;
4875                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4876 
4877                 UChar32 c;
4878                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4879                 if (c < 256) {
4880                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4881                     if (s8->contains(c)) {
4882                         success = !success;
4883                     }
4884                 } else {
4885                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4886                     if (s->contains(c)) {
4887                         success = !success;
4888                     }
4889                 }
4890                 if (!success) {
4891                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4892                 }
4893             }
4894             break;
4895 
4896 
4897         case URX_STAT_SETREF_N:
4898             {
4899                 // Test input character for NOT being a member of  one of
4900                 //    the predefined sets (Word Characters, for example)
4901                 if (fp->fInputIdx >= fActiveLimit) {
4902                     fHitEnd = TRUE;
4903                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4904                     break;
4905                 }
4906 
4907                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4908 
4909                 UChar32  c;
4910                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4911                 if (c < 256) {
4912                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4913                     if (s8->contains(c) == FALSE) {
4914                         break;
4915                     }
4916                 } else {
4917                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4918                     if (s->contains(c) == FALSE) {
4919                         break;
4920                     }
4921                 }
4922                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4923             }
4924             break;
4925 
4926 
4927         case URX_SETREF:
4928             {
4929                 if (fp->fInputIdx >= fActiveLimit) {
4930                     fHitEnd = TRUE;
4931                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4932                     break;
4933                 }
4934 
4935                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4936 
4937                 // There is input left.  Pick up one char and test it for set membership.
4938                 UChar32  c;
4939                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4940                 if (c<256) {
4941                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4942                     if (s8->contains(c)) {
4943                         // The character is in the set.  A Match.
4944                         break;
4945                     }
4946                 } else {
4947                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4948                     if (s->contains(c)) {
4949                         // The character is in the set.  A Match.
4950                         break;
4951                     }
4952                 }
4953 
4954                 // the character wasn't in the set.
4955                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4956             }
4957             break;
4958 
4959 
4960         case URX_DOTANY:
4961             {
4962                 // . matches anything, but stops at end-of-line.
4963                 if (fp->fInputIdx >= fActiveLimit) {
4964                     // At end of input.  Match failed.  Backtrack out.
4965                     fHitEnd = TRUE;
4966                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4967                     break;
4968                 }
4969 
4970                 // There is input left.  Advance over one char, unless we've hit end-of-line
4971                 UChar32  c;
4972                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4973                 if (isLineTerminator(c)) {
4974                     // End of line in normal mode.   . does not match.
4975                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4976                     break;
4977                 }
4978             }
4979             break;
4980 
4981 
4982         case URX_DOTANY_ALL:
4983             {
4984                 // . in dot-matches-all (including new lines) mode
4985                 if (fp->fInputIdx >= fActiveLimit) {
4986                     // At end of input.  Match failed.  Backtrack out.
4987                     fHitEnd = TRUE;
4988                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4989                     break;
4990                 }
4991 
4992                 // There is input left.  Advance over one char, except if we are
4993                 //   at a cr/lf, advance over both of them.
4994                 UChar32 c;
4995                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4996                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4997                     // In the case of a CR/LF, we need to advance over both.
4998                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4999                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5000                     }
5001                 }
5002             }
5003             break;
5004 
5005 
5006         case URX_DOTANY_UNIX:
5007             {
5008                 // '.' operator, matches all, but stops at end-of-line.
5009                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5010                 if (fp->fInputIdx >= fActiveLimit) {
5011                     // At end of input.  Match failed.  Backtrack out.
5012                     fHitEnd = TRUE;
5013                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5014                     break;
5015                 }
5016 
5017                 // There is input left.  Advance over one char, unless we've hit end-of-line
5018                 UChar32 c;
5019                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5020                 if (c == 0x0a) {
5021                     // End of line in normal mode.   '.' does not match the \n
5022                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5023                 }
5024             }
5025             break;
5026 
5027 
5028         case URX_JMP:
5029             fp->fPatIdx = opValue;
5030             break;
5031 
5032         case URX_FAIL:
5033             isMatch = FALSE;
5034             goto breakFromLoop;
5035 
5036         case URX_JMP_SAV:
5037             U_ASSERT(opValue < fPattern->fCompiledPat->size());
5038             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5039             fp->fPatIdx = opValue;                         // Then JMP.
5040             break;
5041 
5042         case URX_JMP_SAV_X:
5043             // This opcode is used with (x)+, when x can match a zero length string.
5044             // Same as JMP_SAV, except conditional on the match having made forward progress.
5045             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5046             //   data address of the input position at the start of the loop.
5047             {
5048                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5049                 int32_t  stoOp = (int32_t)pat[opValue-1];
5050                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5051                 int32_t  frameLoc = URX_VAL(stoOp);
5052                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5053                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5054                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5055                 if (prevInputIdx < fp->fInputIdx) {
5056                     // The match did make progress.  Repeat the loop.
5057                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5058                     fp->fPatIdx = opValue;
5059                     fp->fExtra[frameLoc] = fp->fInputIdx;
5060                 }
5061                 // If the input position did not advance, we do nothing here,
5062                 //   execution will fall out of the loop.
5063             }
5064             break;
5065 
5066         case URX_CTR_INIT:
5067             {
5068                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5069                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5070 
5071                 // Pick up the three extra operands that CTR_INIT has, and
5072                 //    skip the pattern location counter past
5073                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5074                 fp->fPatIdx += 3;
5075                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5076                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5077                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5078                 U_ASSERT(minCount>=0);
5079                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5080                 U_ASSERT(loopLoc>=fp->fPatIdx);
5081 
5082                 if (minCount == 0) {
5083                     fp = StateSave(fp, loopLoc+1, status);
5084                 }
5085                 if (maxCount == -1) {
5086                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
5087                 } else if (maxCount == 0) {
5088                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5089                 }
5090             }
5091             break;
5092 
5093         case URX_CTR_LOOP:
5094             {
5095                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5096                 int32_t initOp = (int32_t)pat[opValue];
5097                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5098                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5099                 int32_t minCount  = (int32_t)pat[opValue+2];
5100                 int32_t maxCount  = (int32_t)pat[opValue+3];
5101                 (*pCounter)++;
5102                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5103                     U_ASSERT(*pCounter == maxCount);
5104                     break;
5105                 }
5106                 if (*pCounter >= minCount) {
5107                     if (maxCount == -1) {
5108                         // Loop has no hard upper bound.
5109                         // Check that it is progressing through the input, break if it is not.
5110                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5111                         if (fp->fInputIdx == *pLastInputIdx) {
5112                             break;
5113                         } else {
5114                             *pLastInputIdx = fp->fInputIdx;
5115                         }
5116                     }
5117                     fp = StateSave(fp, fp->fPatIdx, status);
5118                 } else {
5119                     // Increment time-out counter. (StateSave() does it if count >= minCount)
5120                     fTickCounter--;
5121                     if (fTickCounter <= 0) {
5122                         IncrementTime(status);    // Re-initializes fTickCounter
5123                     }
5124                 }
5125                 fp->fPatIdx = opValue + 4;    // Loop back.
5126             }
5127             break;
5128 
5129         case URX_CTR_INIT_NG:
5130             {
5131                 // Initialize a non-greedy loop
5132                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5133                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5134 
5135                 // Pick up the three extra operands that CTR_INIT_NG has, and
5136                 //    skip the pattern location counter past
5137                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5138                 fp->fPatIdx += 3;
5139                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5140                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5141                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5142                 U_ASSERT(minCount>=0);
5143                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5144                 U_ASSERT(loopLoc>fp->fPatIdx);
5145                 if (maxCount == -1) {
5146                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5147                 }
5148 
5149                 if (minCount == 0) {
5150                     if (maxCount != 0) {
5151                         fp = StateSave(fp, fp->fPatIdx, status);
5152                     }
5153                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5154                 }
5155             }
5156             break;
5157 
5158         case URX_CTR_LOOP_NG:
5159             {
5160                 // Non-greedy {min, max} loops
5161                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5162                 int32_t initOp = (int32_t)pat[opValue];
5163                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5164                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5165                 int32_t minCount  = (int32_t)pat[opValue+2];
5166                 int32_t maxCount  = (int32_t)pat[opValue+3];
5167 
5168                 (*pCounter)++;
5169                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5170                     // The loop has matched the maximum permitted number of times.
5171                     //   Break out of here with no action.  Matching will
5172                     //   continue with the following pattern.
5173                     U_ASSERT(*pCounter == maxCount);
5174                     break;
5175                 }
5176 
5177                 if (*pCounter < minCount) {
5178                     // We haven't met the minimum number of matches yet.
5179                     //   Loop back for another one.
5180                     fp->fPatIdx = opValue + 4;    // Loop back.
5181                     fTickCounter--;
5182                     if (fTickCounter <= 0) {
5183                         IncrementTime(status);    // Re-initializes fTickCounter
5184                     }
5185                 } else {
5186                     // We do have the minimum number of matches.
5187 
5188                     // If there is no upper bound on the loop iterations, check that the input index
5189                     // is progressing, and stop the loop if it is not.
5190                     if (maxCount == -1) {
5191                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5192                         if (fp->fInputIdx == *pLastInputIdx) {
5193                             break;
5194                         }
5195                         *pLastInputIdx = fp->fInputIdx;
5196                     }
5197 
5198                     // Loop Continuation: we will fall into the pattern following the loop
5199                     //   (non-greedy, don't execute loop body first), but first do
5200                     //   a state save to the top of the loop, so that a match failure
5201                     //   in the following pattern will try another iteration of the loop.
5202                     fp = StateSave(fp, opValue + 4, status);
5203                 }
5204             }
5205             break;
5206 
5207         case URX_STO_SP:
5208             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5209             fData[opValue] = fStack->size();
5210             break;
5211 
5212         case URX_LD_SP:
5213             {
5214                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5215                 int32_t newStackSize = (int32_t)fData[opValue];
5216                 U_ASSERT(newStackSize <= fStack->size());
5217                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5218                 if (newFP == (int64_t *)fp) {
5219                     break;
5220                 }
5221                 int32_t j;
5222                 for (j=0; j<fFrameSize; j++) {
5223                     newFP[j] = ((int64_t *)fp)[j];
5224                 }
5225                 fp = (REStackFrame *)newFP;
5226                 fStack->setSize(newStackSize);
5227             }
5228             break;
5229 
5230         case URX_BACKREF:
5231             {
5232                 U_ASSERT(opValue < fFrameSize);
5233                 int64_t groupStartIdx = fp->fExtra[opValue];
5234                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5235                 U_ASSERT(groupStartIdx <= groupEndIdx);
5236                 int64_t inputIndex = fp->fInputIdx;
5237                 if (groupStartIdx < 0) {
5238                     // This capture group has not participated in the match thus far,
5239                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5240                     break;
5241                 }
5242                 UBool success = TRUE;
5243                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5244                     if (inputIndex >= fActiveLimit) {
5245                         success = FALSE;
5246                         fHitEnd = TRUE;
5247                         break;
5248                     }
5249                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5250                         success = FALSE;
5251                         break;
5252                     }
5253                 }
5254                 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5255                         inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5256                     // Capture group ended with an unpaired lead surrogate.
5257                     // Back reference is not permitted to match lead only of a surrogatge pair.
5258                     success = FALSE;
5259                 }
5260                 if (success) {
5261                     fp->fInputIdx = inputIndex;
5262                 } else {
5263                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5264                 }
5265             }
5266             break;
5267 
5268         case URX_BACKREF_I:
5269             {
5270                 U_ASSERT(opValue < fFrameSize);
5271                 int64_t groupStartIdx = fp->fExtra[opValue];
5272                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5273                 U_ASSERT(groupStartIdx <= groupEndIdx);
5274                 if (groupStartIdx < 0) {
5275                     // This capture group has not participated in the match thus far,
5276                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5277                     break;
5278                 }
5279                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5280                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5281 
5282                 //   Note: if the capture group match was of an empty string the backref
5283                 //         match succeeds.  Verified by testing:  Perl matches succeed
5284                 //         in this case, so we do too.
5285 
5286                 UBool success = TRUE;
5287                 for (;;) {
5288                     UChar32 captureGroupChar = captureGroupItr.next();
5289                     if (captureGroupChar == U_SENTINEL) {
5290                         success = TRUE;
5291                         break;
5292                     }
5293                     UChar32 inputChar = inputItr.next();
5294                     if (inputChar == U_SENTINEL) {
5295                         success = FALSE;
5296                         fHitEnd = TRUE;
5297                         break;
5298                     }
5299                     if (inputChar != captureGroupChar) {
5300                         success = FALSE;
5301                         break;
5302                     }
5303                 }
5304 
5305                 if (success && inputItr.inExpansion()) {
5306                     // We otained a match by consuming part of a string obtained from
5307                     // case-folding a single code point of the input text.
5308                     // This does not count as an overall match.
5309                     success = FALSE;
5310                 }
5311 
5312                 if (success) {
5313                     fp->fInputIdx = inputItr.getIndex();
5314                 } else {
5315                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5316                 }
5317             }
5318             break;
5319 
5320         case URX_STO_INP_LOC:
5321             {
5322                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5323                 fp->fExtra[opValue] = fp->fInputIdx;
5324             }
5325             break;
5326 
5327         case URX_JMPX:
5328             {
5329                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5330                 fp->fPatIdx += 1;
5331                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5332                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5333                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5334                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5335                 if (savedInputIdx < fp->fInputIdx) {
5336                     fp->fPatIdx = opValue;                               // JMP
5337                 } else {
5338                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5339                 }
5340             }
5341             break;
5342 
5343         case URX_LA_START:
5344             {
5345                 // Entering a look around block.
5346                 // Save Stack Ptr, Input Pos.
5347                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5348                 fData[opValue]   = fStack->size();
5349                 fData[opValue+1] = fp->fInputIdx;
5350                 fData[opValue+2] = fActiveStart;
5351                 fData[opValue+3] = fActiveLimit;
5352                 fActiveStart     = fLookStart;          // Set the match region change for
5353                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5354             }
5355             break;
5356 
5357         case URX_LA_END:
5358             {
5359                 // Leaving a look around block.
5360                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5361                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5362                 int32_t stackSize = fStack->size();
5363                 int32_t newStackSize = (int32_t)fData[opValue];
5364                 U_ASSERT(stackSize >= newStackSize);
5365                 if (stackSize > newStackSize) {
5366                     // Copy the current top frame back to the new (cut back) top frame.
5367                     //   This makes the capture groups from within the look-ahead
5368                     //   expression available.
5369                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5370                     int32_t j;
5371                     for (j=0; j<fFrameSize; j++) {
5372                         newFP[j] = ((int64_t *)fp)[j];
5373                     }
5374                     fp = (REStackFrame *)newFP;
5375                     fStack->setSize(newStackSize);
5376                 }
5377                 fp->fInputIdx = fData[opValue+1];
5378 
5379                 // Restore the active region bounds in the input string; they may have
5380                 //    been changed because of transparent bounds on a Region.
5381                 fActiveStart = fData[opValue+2];
5382                 fActiveLimit = fData[opValue+3];
5383                 U_ASSERT(fActiveStart >= 0);
5384                 U_ASSERT(fActiveLimit <= fInputLength);
5385             }
5386             break;
5387 
5388         case URX_ONECHAR_I:
5389             if (fp->fInputIdx < fActiveLimit) {
5390                 UChar32 c;
5391                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5392                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5393                     break;
5394                 }
5395             } else {
5396                 fHitEnd = TRUE;
5397             }
5398             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5399             break;
5400 
5401         case URX_STRING_I:
5402             // Case-insensitive test input against a literal string.
5403             // Strings require two slots in the compiled pattern, one for the
5404             //   offset to the string text, and one for the length.
5405             //   The compiled string has already been case folded.
5406             {
5407                 const UChar *patternString = litText + opValue;
5408 
5409                 op      = (int32_t)pat[fp->fPatIdx];
5410                 fp->fPatIdx++;
5411                 opType  = URX_TYPE(op);
5412                 opValue = URX_VAL(op);
5413                 U_ASSERT(opType == URX_STRING_LEN);
5414                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5415 
5416                 UChar32      cText;
5417                 UChar32      cPattern;
5418                 UBool        success = TRUE;
5419                 int32_t      patternStringIdx  = 0;
5420                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5421                 while (patternStringIdx < patternStringLen) {
5422                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5423                     cText = inputIterator.next();
5424                     if (cText != cPattern) {
5425                         success = FALSE;
5426                         if (cText == U_SENTINEL) {
5427                             fHitEnd = TRUE;
5428                         }
5429                         break;
5430                     }
5431                 }
5432                 if (inputIterator.inExpansion()) {
5433                     success = FALSE;
5434                 }
5435 
5436                 if (success) {
5437                     fp->fInputIdx = inputIterator.getIndex();
5438                 } else {
5439                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5440                 }
5441             }
5442             break;
5443 
5444         case URX_LB_START:
5445             {
5446                 // Entering a look-behind block.
5447                 // Save Stack Ptr, Input Pos and active input region.
5448                 //   TODO:  implement transparent bounds.  Ticket #6067
5449                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5450                 fData[opValue]   = fStack->size();
5451                 fData[opValue+1] = fp->fInputIdx;
5452                 // Save input string length, then reset to pin any matches to end at
5453                 //   the current position.
5454                 fData[opValue+2] = fActiveStart;
5455                 fData[opValue+3] = fActiveLimit;
5456                 fActiveStart     = fRegionStart;
5457                 fActiveLimit     = fp->fInputIdx;
5458                 // Init the variable containing the start index for attempted matches.
5459                 fData[opValue+4] = -1;
5460             }
5461             break;
5462 
5463 
5464         case URX_LB_CONT:
5465             {
5466                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5467                 //    at all possible input starting positions.
5468 
5469                 // Fetch the min and max possible match lengths.  They are the operands
5470                 //   of this op in the pattern.
5471                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5472                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5473                 U_ASSERT(minML <= maxML);
5474                 U_ASSERT(minML >= 0);
5475 
5476                 // Fetch (from data) the last input index where a match was attempted.
5477                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5478                 int64_t  &lbStartIdx = fData[opValue+4];
5479                 if (lbStartIdx < 0) {
5480                     // First time through loop.
5481                     lbStartIdx = fp->fInputIdx - minML;
5482                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5483                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5484                     }
5485                 } else {
5486                     // 2nd through nth time through the loop.
5487                     // Back up start position for match by one.
5488                     if (lbStartIdx == 0) {
5489                         lbStartIdx--;
5490                     } else {
5491                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5492                     }
5493                 }
5494 
5495                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5496                     // We have tried all potential match starting points without
5497                     //  getting a match.  Backtrack out, and out of the
5498                     //   Look Behind altogether.
5499                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5500                     fActiveStart = fData[opValue+2];
5501                     fActiveLimit = fData[opValue+3];
5502                     U_ASSERT(fActiveStart >= 0);
5503                     U_ASSERT(fActiveLimit <= fInputLength);
5504                     break;
5505                 }
5506 
5507                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5508                 //      (successful match will fall off the end of the loop.)
5509                 fp = StateSave(fp, fp->fPatIdx-3, status);
5510                 fp->fInputIdx =  lbStartIdx;
5511             }
5512             break;
5513 
5514         case URX_LB_END:
5515             // End of a look-behind block, after a successful match.
5516             {
5517                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5518                 if (fp->fInputIdx != fActiveLimit) {
5519                     //  The look-behind expression matched, but the match did not
5520                     //    extend all the way to the point that we are looking behind from.
5521                     //  FAIL out of here, which will take us back to the LB_CONT, which
5522                     //     will retry the match starting at another position or fail
5523                     //     the look-behind altogether, whichever is appropriate.
5524                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5525                     break;
5526                 }
5527 
5528                 // Look-behind match is good.  Restore the orignal input string region,
5529                 //   which had been truncated to pin the end of the lookbehind match to the
5530                 //   position being looked-behind.
5531                 fActiveStart = fData[opValue+2];
5532                 fActiveLimit = fData[opValue+3];
5533                 U_ASSERT(fActiveStart >= 0);
5534                 U_ASSERT(fActiveLimit <= fInputLength);
5535             }
5536             break;
5537 
5538 
5539         case URX_LBN_CONT:
5540             {
5541                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5542                 //    at all possible input starting positions.
5543 
5544                 // Fetch the extra parameters of this op.
5545                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5546                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5547                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5548                 continueLoc = URX_VAL(continueLoc);
5549                 U_ASSERT(minML <= maxML);
5550                 U_ASSERT(minML >= 0);
5551                 U_ASSERT(continueLoc > fp->fPatIdx);
5552 
5553                 // Fetch (from data) the last input index where a match was attempted.
5554                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5555                 int64_t  &lbStartIdx = fData[opValue+4];
5556                 if (lbStartIdx < 0) {
5557                     // First time through loop.
5558                     lbStartIdx = fp->fInputIdx - minML;
5559                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5560                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5561                     }
5562                 } else {
5563                     // 2nd through nth time through the loop.
5564                     // Back up start position for match by one.
5565                     if (lbStartIdx == 0) {
5566                         lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5567                     } else {
5568                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5569                     }
5570                 }
5571 
5572                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5573                     // We have tried all potential match starting points without
5574                     //  getting a match, which means that the negative lookbehind as
5575                     //  a whole has succeeded.  Jump forward to the continue location
5576                     fActiveStart = fData[opValue+2];
5577                     fActiveLimit = fData[opValue+3];
5578                     U_ASSERT(fActiveStart >= 0);
5579                     U_ASSERT(fActiveLimit <= fInputLength);
5580                     fp->fPatIdx = continueLoc;
5581                     break;
5582                 }
5583 
5584                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5585                 //      (successful match will cause a FAIL out of the loop altogether.)
5586                 fp = StateSave(fp, fp->fPatIdx-4, status);
5587                 fp->fInputIdx =  lbStartIdx;
5588             }
5589             break;
5590 
5591         case URX_LBN_END:
5592             // End of a negative look-behind block, after a successful match.
5593             {
5594                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5595                 if (fp->fInputIdx != fActiveLimit) {
5596                     //  The look-behind expression matched, but the match did not
5597                     //    extend all the way to the point that we are looking behind from.
5598                     //  FAIL out of here, which will take us back to the LB_CONT, which
5599                     //     will retry the match starting at another position or succeed
5600                     //     the look-behind altogether, whichever is appropriate.
5601                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5602                     break;
5603                 }
5604 
5605                 // Look-behind expression matched, which means look-behind test as
5606                 //   a whole Fails
5607 
5608                 //   Restore the orignal input string length, which had been truncated
5609                 //   inorder to pin the end of the lookbehind match
5610                 //   to the position being looked-behind.
5611                 fActiveStart = fData[opValue+2];
5612                 fActiveLimit = fData[opValue+3];
5613                 U_ASSERT(fActiveStart >= 0);
5614                 U_ASSERT(fActiveLimit <= fInputLength);
5615 
5616                 // Restore original stack position, discarding any state saved
5617                 //   by the successful pattern match.
5618                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5619                 int32_t newStackSize = (int32_t)fData[opValue];
5620                 U_ASSERT(fStack->size() > newStackSize);
5621                 fStack->setSize(newStackSize);
5622 
5623                 //  FAIL, which will take control back to someplace
5624                 //  prior to entering the look-behind test.
5625                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5626             }
5627             break;
5628 
5629 
5630         case URX_LOOP_SR_I:
5631             // Loop Initialization for the optimized implementation of
5632             //     [some character set]*
5633             //   This op scans through all matching input.
5634             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5635             {
5636                 U_ASSERT(opValue > 0 && opValue < fSets->size());
5637                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5638                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
5639 
5640                 // Loop through input, until either the input is exhausted or
5641                 //   we reach a character that is not a member of the set.
5642                 int32_t ix = (int32_t)fp->fInputIdx;
5643                 for (;;) {
5644                     if (ix >= fActiveLimit) {
5645                         fHitEnd = TRUE;
5646                         break;
5647                     }
5648                     UChar32   c;
5649                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5650                     if (c<256) {
5651                         if (s8->contains(c) == FALSE) {
5652                             U16_BACK_1(inputBuf, 0, ix);
5653                             break;
5654                         }
5655                     } else {
5656                         if (s->contains(c) == FALSE) {
5657                             U16_BACK_1(inputBuf, 0, ix);
5658                             break;
5659                         }
5660                     }
5661                 }
5662 
5663                 // If there were no matching characters, skip over the loop altogether.
5664                 //   The loop doesn't run at all, a * op always succeeds.
5665                 if (ix == fp->fInputIdx) {
5666                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5667                     break;
5668                 }
5669 
5670                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5671                 //   must follow.  It's operand is the stack location
5672                 //   that holds the starting input index for the match of this [set]*
5673                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5674                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5675                 int32_t stackLoc = URX_VAL(loopcOp);
5676                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5677                 fp->fExtra[stackLoc] = fp->fInputIdx;
5678                 fp->fInputIdx = ix;
5679 
5680                 // Save State to the URX_LOOP_C op that follows this one,
5681                 //   so that match failures in the following code will return to there.
5682                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5683                 fp = StateSave(fp, fp->fPatIdx, status);
5684                 fp->fPatIdx++;
5685             }
5686             break;
5687 
5688 
5689         case URX_LOOP_DOT_I:
5690             // Loop Initialization for the optimized implementation of .*
5691             //   This op scans through all remaining input.
5692             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5693             {
5694                 // Loop through input until the input is exhausted (we reach an end-of-line)
5695                 // In DOTALL mode, we can just go straight to the end of the input.
5696                 int32_t ix;
5697                 if ((opValue & 1) == 1) {
5698                     // Dot-matches-All mode.  Jump straight to the end of the string.
5699                     ix = (int32_t)fActiveLimit;
5700                     fHitEnd = TRUE;
5701                 } else {
5702                     // NOT DOT ALL mode.  Line endings do not match '.'
5703                     // Scan forward until a line ending or end of input.
5704                     ix = (int32_t)fp->fInputIdx;
5705                     for (;;) {
5706                         if (ix >= fActiveLimit) {
5707                             fHitEnd = TRUE;
5708                             break;
5709                         }
5710                         UChar32   c;
5711                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5712                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5713                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5714                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5715                                    isLineTerminator(c))) {
5716                                 //  char is a line ending.  Put the input pos back to the
5717                                 //    line ending char, and exit the scanning loop.
5718                                 U16_BACK_1(inputBuf, 0, ix);
5719                                 break;
5720                             }
5721                         }
5722                     }
5723                 }
5724 
5725                 // If there were no matching characters, skip over the loop altogether.
5726                 //   The loop doesn't run at all, a * op always succeeds.
5727                 if (ix == fp->fInputIdx) {
5728                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5729                     break;
5730                 }
5731 
5732                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5733                 //   must follow.  It's operand is the stack location
5734                 //   that holds the starting input index for the match of this .*
5735                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5736                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5737                 int32_t stackLoc = URX_VAL(loopcOp);
5738                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5739                 fp->fExtra[stackLoc] = fp->fInputIdx;
5740                 fp->fInputIdx = ix;
5741 
5742                 // Save State to the URX_LOOP_C op that follows this one,
5743                 //   so that match failures in the following code will return to there.
5744                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5745                 fp = StateSave(fp, fp->fPatIdx, status);
5746                 fp->fPatIdx++;
5747             }
5748             break;
5749 
5750 
5751         case URX_LOOP_C:
5752             {
5753                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5754                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5755                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5756                 if (backSearchIndex == fp->fInputIdx) {
5757                     // We've backed up the input idx to the point that the loop started.
5758                     // The loop is done.  Leave here without saving state.
5759                     //  Subsequent failures won't come back here.
5760                     break;
5761                 }
5762                 // Set up for the next iteration of the loop, with input index
5763                 //   backed up by one from the last time through,
5764                 //   and a state save to this instruction in case the following code fails again.
5765                 //   (We're going backwards because this loop emulates stack unwinding, not
5766                 //    the initial scan forward.)
5767                 U_ASSERT(fp->fInputIdx > 0);
5768                 UChar32 prevC;
5769                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5770 
5771                 if (prevC == 0x0a &&
5772                     fp->fInputIdx > backSearchIndex &&
5773                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5774                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5775                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5776                         // .*, stepping back over CRLF pair.
5777                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5778                     }
5779                 }
5780 
5781 
5782                 fp = StateSave(fp, fp->fPatIdx-1, status);
5783             }
5784             break;
5785 
5786 
5787 
5788         default:
5789             // Trouble.  The compiled pattern contains an entry with an
5790             //           unrecognized type tag.
5791             UPRV_UNREACHABLE;
5792         }
5793 
5794         if (U_FAILURE(status)) {
5795             isMatch = FALSE;
5796             break;
5797         }
5798     }
5799 
5800 breakFromLoop:
5801     fMatch = isMatch;
5802     if (isMatch) {
5803         fLastMatchEnd = fMatchEnd;
5804         fMatchStart   = startIdx;
5805         fMatchEnd     = fp->fInputIdx;
5806     }
5807 
5808 #ifdef REGEX_RUN_DEBUG
5809     if (fTraceDebug) {
5810         if (isMatch) {
5811             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5812         } else {
5813             printf("No match\n\n");
5814         }
5815     }
5816 #endif
5817 
5818     fFrame = fp;                // The active stack frame when the engine stopped.
5819                                 //   Contains the capture group results that we need to
5820                                 //    access later.
5821 
5822     return;
5823 }
5824 
5825 
5826 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5827 
5828 U_NAMESPACE_END
5829 
5830 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5831 
5832