1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2002-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 
9 //
10 //   regextst.cpp
11 //
12 //      ICU Regular Expressions test, part of intltest.
13 //
14 
15 /*
16      NOTE!!
17 
18      PLEASE be careful about ASCII assumptions in this test.
19      This test is one of the worst repeat offenders.
20      If you have questions, contact someone on the ICU PMC
21      who has access to an EBCDIC system.
22 
23  */
24 
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/stringpiece.h"
35 #include "unicode/uchar.h"
36 #include "unicode/ucnv.h"
37 #include "unicode/uniset.h"
38 #include "unicode/uregex.h"
39 #include "unicode/usetiter.h"
40 #include "unicode/ustring.h"
41 #include "unicode/utext.h"
42 #include "unicode/utf16.h"
43 #include "cstr.h"
44 #include "regextst.h"
45 #include "regexcmp.h"
46 #include "uvector.h"
47 #include "util.h"
48 #include "cmemory.h"
49 #include "cstring.h"
50 #include "uinvchar.h"
51 
52 #define SUPPORT_MUTATING_INPUT_STRING   0
53 
54 //---------------------------------------------------------------------------
55 //
56 //  Test class boilerplate
57 //
58 //---------------------------------------------------------------------------
RegexTest()59 RegexTest::RegexTest()
60 {
61 }
62 
63 
~RegexTest()64 RegexTest::~RegexTest()
65 {
66 }
67 
68 
69 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)70 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
71 {
72     if (exec) logln("TestSuite RegexTest: ");
73     TESTCASE_AUTO_BEGIN;
74     TESTCASE_AUTO(Basic);
75     TESTCASE_AUTO(API_Match);
76     TESTCASE_AUTO(API_Replace);
77     TESTCASE_AUTO(API_Pattern);
78 #if !UCONFIG_NO_FILE_IO
79     TESTCASE_AUTO(Extended);
80 #endif
81     TESTCASE_AUTO(Errors);
82     TESTCASE_AUTO(PerlTests);
83     TESTCASE_AUTO(Callbacks);
84     TESTCASE_AUTO(FindProgressCallbacks);
85     TESTCASE_AUTO(Bug6149);
86     TESTCASE_AUTO(UTextBasic);
87     TESTCASE_AUTO(API_Match_UTF8);
88     TESTCASE_AUTO(API_Replace_UTF8);
89     TESTCASE_AUTO(API_Pattern_UTF8);
90     TESTCASE_AUTO(PerlTestsUTF8);
91     TESTCASE_AUTO(PreAllocatedUTextCAPI);
92     TESTCASE_AUTO(Bug7651);
93     TESTCASE_AUTO(Bug7740);
94     TESTCASE_AUTO(Bug8479);
95     TESTCASE_AUTO(Bug7029);
96     TESTCASE_AUTO(CheckInvBufSize);
97     TESTCASE_AUTO(Bug9283);
98     TESTCASE_AUTO(Bug10459);
99     TESTCASE_AUTO(TestCaseInsensitiveStarters);
100     TESTCASE_AUTO(TestBug11049);
101     TESTCASE_AUTO(TestBug11371);
102     TESTCASE_AUTO(TestBug11480);
103     TESTCASE_AUTO(NamedCapture);
104     TESTCASE_AUTO(NamedCaptureLimits);
105     TESTCASE_AUTO(TestBug12884);
106     TESTCASE_AUTO(TestBug13631);
107     TESTCASE_AUTO(TestBug13632);
108     TESTCASE_AUTO(TestBug20359);
109     TESTCASE_AUTO(TestBug20863);
110     TESTCASE_AUTO_END;
111 }
112 
113 
114 /**
115  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
116  * into ASCII.
117  * @see utext_openUTF8
118  */
119 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
120 
121 //---------------------------------------------------------------------------
122 //
123 //   Error Checking / Reporting macros used in all of the tests.
124 //
125 //---------------------------------------------------------------------------
126 
utextToPrintable(char * buf,int32_t bufLen,UText * text)127 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
128   int64_t oldIndex = utext_getNativeIndex(text);
129   utext_setNativeIndex(text, 0);
130   char *bufPtr = buf;
131   UChar32 c = utext_next32From(text, 0);
132   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
133     if (0x000020<=c && c<0x00007e) {
134       *bufPtr = c;
135     } else {
136 #if 0
137       sprintf(bufPtr,"U+%04X", c);
138       bufPtr+= strlen(bufPtr)-1;
139 #else
140       *bufPtr = '%';
141 #endif
142     }
143     bufPtr++;
144     c = UTEXT_NEXT32(text);
145   }
146   *bufPtr = 0;
147 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
148   char *ebuf = (char*)malloc(bufLen);
149   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
150   uprv_strncpy(buf, ebuf, bufLen);
151   free((void*)ebuf);
152 #endif
153   utext_setNativeIndex(text, oldIndex);
154 }
155 
156 
157 static char ASSERT_BUF[1024];
158 
extractToAssertBuf(const UnicodeString & message)159 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
160   if(message.length()==0) {
161     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
162   } else {
163     UnicodeString buf;
164     IntlTest::prettify(message,buf);
165     if(buf.length()==0) {
166       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
167     } else {
168       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
169       if(ASSERT_BUF[0]==0) {
170         ASSERT_BUF[0]=0;
171         for(int32_t i=0;i<buf.length();i++) {
172           UChar ch = buf[i];
173           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
174         }
175       }
176     }
177   }
178   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
179   return ASSERT_BUF;
180 }
181 
// The macros below call logln/errln/dataerrln unqualified, so they are meant
// to be expanded inside RegexTest member functions.  Unless noted otherwise
// they read a variable named `status` from the expansion site.

// Logs the printable form of a UText together with the expression that named it.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
} UPRV_BLOCK_MACRO_END

// If `status` is a failure: report it through dataerrln and RETURN from the
// enclosing function (which therefore has to return void).
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Reports an error if expr evaluates to false.  Does not return.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END

// Evaluates expr against a fresh, block-local `status` (which hides any outer
// variable of that name) and reports if the resulting status differs from errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status != (errcode)) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number
// and does NOT return.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
} UPRV_BLOCK_MACRO_END

// Like REGEX_ASSERT, but also reports a caller-supplied line number and
// RETURNS from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Compares an expected invariant-character C string with an actual UnicodeString.
//   expected: const char * , restricted to invariant characters.
//   actual:   const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
} UPRV_BLOCK_MACRO_END
232 
233 
testUTextEqual(UText * uta,UText * utb)234 static UBool testUTextEqual(UText *uta, UText *utb) {
235     UChar32 ca = 0;
236     UChar32 cb = 0;
237     utext_setNativeIndex(uta, 0);
238     utext_setNativeIndex(utb, 0);
239     do {
240         ca = utext_next32(uta);
241         cb = utext_next32(utb);
242         if (ca != cb) {
243             break;
244         }
245     } while (ca != U_SENTINEL);
246     return ca == cb;
247 }
248 
249 
250 /**
251  * @param expected expected text in UTF-8 (not platform) codepage
252  */
assertUText(const char * expected,UText * actual,const char * file,int line)253 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
254     UErrorCode status = U_ZERO_ERROR;
255     UText expectedText = UTEXT_INITIALIZER;
256     utext_openUTF8(&expectedText, expected, -1, &status);
257     if(U_FAILURE(status)) {
258       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
259       return;
260     }
261     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
262       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
263       return;
264     }
265     utext_setNativeIndex(actual, 0);
266     if (!testUTextEqual(&expectedText, actual)) {
267         char buf[201 /*21*/];
268         char expectedBuf[201];
269         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
270         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
271         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
272     }
273     utext_close(&expectedText);
274 }
275 /**
276  * @param expected invariant (platform local text) input
277  */
278 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)279 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
280     UErrorCode status = U_ZERO_ERROR;
281     UText expectedText = UTEXT_INITIALIZER;
282     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
283     if(U_FAILURE(status)) {
284       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
285       return;
286     }
287     utext_setNativeIndex(actual, 0);
288     if (!testUTextEqual(&expectedText, actual)) {
289         char buf[201 /*21*/];
290         char expectedBuf[201];
291         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
292         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
293         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
294     }
295     utext_close(&expectedText);
296 }
297 
/**
 * Checks a UText against expected text given as UTF-8 bytes.
 * Forwards to assertUText() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Checks a UText against expected text given as invariant characters.
 * Forwards to assertUTextInvariant() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
306 
307 /**
308  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
309  * passed into utext_openUTF8. An error will be given if
310  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
311  */
312 
313 #define INV_BUFSIZ 2048 /* increase this if too small */
314 
315 static int64_t inv_next=0;
316 
317 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
318 static char inv_buf[INV_BUFSIZ];
319 #endif
320 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)321 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
322   if(length==-1) length=strlen(inv);
323 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
324   inv_next+=length;
325   return utext_openUTF8(ut, inv, length, status);
326 #else
327   if(inv_next+length+1>INV_BUFSIZ) {
328     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
329             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
330     *status = U_MEMORY_ALLOCATION_ERROR;
331     return NULL;
332   }
333 
334   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
335   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
336   inv_next+=length;
337 
338 #if 0
339   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
340 #endif
341 
342   return utext_openUTF8(ut, (const char*)buf, length, status);
343 #endif
344 }
345 
346 
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM    Shorthand for quick lookingAt() / matches() checks.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          Both expected results are UBool - TRUE or FALSE.
//          The input text goes through unescape() before matching;
//          the pattern is used exactly as written.
//
//          Each use runs the check twice: once through doRegexLMTest() and
//          once through doRegexLMTestUTF8().  The line number of the call
//          site is passed along for use in failure messages.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
} UPRV_BLOCK_MACRO_END
365 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)366 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
367     const UnicodeString pattern(pat, -1, US_INV);
368     const UnicodeString inputText(text, -1, US_INV);
369     UErrorCode          status  = U_ZERO_ERROR;
370     UParseError         pe;
371     RegexPattern        *REPattern = NULL;
372     RegexMatcher        *REMatcher = NULL;
373     UBool               retVal     = TRUE;
374 
375     UnicodeString patString(pat, -1, US_INV);
376     REPattern = RegexPattern::compile(patString, 0, pe, status);
377     if (U_FAILURE(status)) {
378         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
379             line, u_errorName(status));
380         return FALSE;
381     }
382     if (line==376) { REPattern->dumpPattern();}
383 
384     UnicodeString inputString(inputText);
385     UnicodeString unEscapedInput = inputString.unescape();
386     REMatcher = REPattern->matcher(unEscapedInput, status);
387     if (U_FAILURE(status)) {
388         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
389             line, u_errorName(status));
390         return FALSE;
391     }
392 
393     UBool actualmatch;
394     actualmatch = REMatcher->lookingAt(status);
395     if (U_FAILURE(status)) {
396         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
397             line, u_errorName(status));
398         retVal =  FALSE;
399     }
400     if (actualmatch != looking) {
401         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
402         retVal = FALSE;
403     }
404 
405     status = U_ZERO_ERROR;
406     actualmatch = REMatcher->matches(status);
407     if (U_FAILURE(status)) {
408         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
409             line, u_errorName(status));
410         retVal = FALSE;
411     }
412     if (actualmatch != match) {
413         errln("RegexTest: wrong return from matches() at line %d.\n", line);
414         retVal = FALSE;
415     }
416 
417     if (retVal == FALSE) {
418         REPattern->dumpPattern();
419     }
420 
421     delete REPattern;
422     delete REMatcher;
423     return retVal;
424 }
425 
426 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)427 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
428     UText               pattern    = UTEXT_INITIALIZER;
429     int32_t             inputUTF8Length;
430     char                *textChars = NULL;
431     UText               inputText  = UTEXT_INITIALIZER;
432     UErrorCode          status     = U_ZERO_ERROR;
433     UParseError         pe;
434     RegexPattern        *REPattern = NULL;
435     RegexMatcher        *REMatcher = NULL;
436     UBool               retVal     = TRUE;
437 
438     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
439     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
440     if (U_FAILURE(status)) {
441         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
442             line, u_errorName(status));
443         return FALSE;
444     }
445 
446     UnicodeString inputString(text, -1, US_INV);
447     UnicodeString unEscapedInput = inputString.unescape();
448     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
449     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
450 
451     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
452     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
453         // UTF-8 does not allow unpaired surrogates, so this could actually happen
454         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
455         return TRUE; // not a failure of the Regex engine
456     }
457     status = U_ZERO_ERROR; // buffer overflow
458     textChars = new char[inputUTF8Length+1];
459     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
460     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
461 
462     REMatcher = &REPattern->matcher(status)->reset(&inputText);
463     if (U_FAILURE(status)) {
464         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
465             line, u_errorName(status));
466         return FALSE;
467     }
468 
469     UBool actualmatch;
470     actualmatch = REMatcher->lookingAt(status);
471     if (U_FAILURE(status)) {
472         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
473             line, u_errorName(status));
474         retVal =  FALSE;
475     }
476     if (actualmatch != looking) {
477         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
478         retVal = FALSE;
479     }
480 
481     status = U_ZERO_ERROR;
482     actualmatch = REMatcher->matches(status);
483     if (U_FAILURE(status)) {
484         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
485             line, u_errorName(status));
486         retVal = FALSE;
487     }
488     if (actualmatch != match) {
489         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
490         retVal = FALSE;
491     }
492 
493     if (retVal == FALSE) {
494         REPattern->dumpPattern();
495     }
496 
497     delete REPattern;
498     delete REMatcher;
499     utext_close(&inputText);
500     utext_close(&pattern);
501     delete[] textChars;
502     return retVal;
503 }
504 
505 
506 
507 //---------------------------------------------------------------------------
508 //
509 //    REGEX_ERR       Macro + invocation function to simplify writing tests
510 //                       regex tests for incorrect patterns
511 //
512 //       usage:
513 //          REGEX_ERR("pattern",   expected error line, column, expected status);
514 //
515 //---------------------------------------------------------------------------
516 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
517 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)518 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
519                           UErrorCode expectedStatus, int32_t line) {
520     UnicodeString       pattern(pat);
521 
522     UErrorCode          status         = U_ZERO_ERROR;
523     UParseError         pe;
524     RegexPattern        *callerPattern = NULL;
525 
526     //
527     //  Compile the caller's pattern
528     //
529     UnicodeString patString(pat);
530     callerPattern = RegexPattern::compile(patString, 0, pe, status);
531     if (status != expectedStatus) {
532         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
533     } else {
534         if (status != U_ZERO_ERROR) {
535             if (pe.line != errLine || pe.offset != errCol) {
536                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
537                     line, errLine, errCol, pe.line, pe.offset);
538             }
539         }
540     }
541 
542     delete callerPattern;
543 
544     //
545     //  Compile again, using a UTF-8-based UText
546     //
547     UText patternText = UTEXT_INITIALIZER;
548     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
549     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
550     if (status != expectedStatus) {
551         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
552     } else {
553         if (status != U_ZERO_ERROR) {
554             if (pe.line != errLine || pe.offset != errCol) {
555                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
556                     line, errLine, errCol, pe.line, pe.offset);
557             }
558         }
559     }
560 
561     delete callerPattern;
562     utext_close(&patternText);
563 }
564 
565 
566 
567 //---------------------------------------------------------------------------
568 //
569 //      Basic      Check for basic functionality of regex pattern matching.
570 //                 Avoid the use of REGEX_FIND test macro, which has
571 //                 substantial dependencies on basic Regex functionality.
572 //
573 //---------------------------------------------------------------------------
Basic()574 void RegexTest::Basic() {
575 
576 
577 //
578 // Debug - slide failing test cases early
579 //
580 #if 0
581     {
582         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
583         UParseError pe;
584         UErrorCode  status = U_ZERO_ERROR;
585         RegexPattern *pattern;
586         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
587         pattern->dumpPattern();
588         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
589         UBool result = m->find();
590         printf("result = %d\n", result);
591         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
592         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
593     }
594     exit(1);
595 #endif
596 
597 
598     //
599     // Pattern with parentheses
600     //
601     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
602     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
603     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
604 
605     //
606     // Patterns with *
607     //
608     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
609     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
610     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
611     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
612     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
613 
614     REGEX_TESTLM("a*", "",  TRUE, TRUE);
615     REGEX_TESTLM("a*", "b", TRUE, FALSE);
616 
617 
618     //
619     //  Patterns with "."
620     //
621     REGEX_TESTLM(".", "abc", TRUE, FALSE);
622     REGEX_TESTLM("...", "abc", TRUE, TRUE);
623     REGEX_TESTLM("....", "abc", FALSE, FALSE);
624     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
625     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
626     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
627     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
628     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
629 
630     //
631     //  Patterns with * applied to chars at end of literal string
632     //
633     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
634     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
635 
636     //
637     //  Supplemental chars match as single chars, not a pair of surrogates.
638     //
639     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
640     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
641     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
642 
643 
644     //
645     //  UnicodeSets in the pattern
646     //
647     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
648     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
649     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
650     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
651     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
652     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
653 
654     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
655     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
656     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
657     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
658     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
659 
660     //
661     //   OR operator in patterns
662     //
663     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
664     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
665     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
666     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
667 
668     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
669     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
670     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
671     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
672     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
673     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
674 
675     //
676     //  +
677     //
678     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
679     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
680     REGEX_TESTLM("b+", "", FALSE, FALSE);
681     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
682     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
683     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
684 
685     //
686     //   ?
687     //
688     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
689     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
690     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
691     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
692     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
693     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
694     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
695     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
696     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
697 
698     //
699     //  Escape sequences that become single literal chars, handled internally
700     //   by ICU's Unescape.
701     //
702 
703     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
704     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
705     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
706     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
707     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
708     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
709     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
710     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
711     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
712     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
713 
714     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
715     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
716 
717     // Escape of special chars in patterns
718     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
719 }
720 
721 
722 //---------------------------------------------------------------------------
723 //
724 //    UTextBasic   Check for quirks that are specific to the UText
725 //                 implementation.
726 //
727 //---------------------------------------------------------------------------
UTextBasic()728 void RegexTest::UTextBasic() {
729     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
730     UErrorCode status = U_ZERO_ERROR;
731     UText pattern = UTEXT_INITIALIZER;
732     utext_openUTF8(&pattern, str_abc, -1, &status);
733     RegexMatcher matcher(&pattern, 0, status);
734     REGEX_CHECK_STATUS;
735 
736     UText input = UTEXT_INITIALIZER;
737     utext_openUTF8(&input, str_abc, -1, &status);
738     REGEX_CHECK_STATUS;
739     matcher.reset(&input);
740     REGEX_CHECK_STATUS;
741     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
742 
743     matcher.reset(matcher.inputText());
744     REGEX_CHECK_STATUS;
745     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
746 
747     utext_close(&pattern);
748     utext_close(&input);
749 }
750 
751 
752 //---------------------------------------------------------------------------
753 //
754 //      API_Match   Test that the API for class RegexMatcher
755 //                  is present and nominally working, but excluding functions
756 //                  implementing replace operations.
757 //
758 //---------------------------------------------------------------------------
API_Match()759 void RegexTest::API_Match() {
760     UParseError         pe;
761     UErrorCode          status=U_ZERO_ERROR;
762     int32_t             flags = 0;
763 
764     //
765     // Debug - slide failing test cases early
766     //
767 #if 0
768     {
769     }
770     return;
771 #endif
772 
773     //
774     // Simple pattern compilation
775     //
776     {
777         UnicodeString       re("abc");
778         RegexPattern        *pat2;
779         pat2 = RegexPattern::compile(re, flags, pe, status);
780         REGEX_CHECK_STATUS;
781 
782         UnicodeString inStr1 = "abcdef this is a test";
783         UnicodeString instr2 = "not abc";
784         UnicodeString empty  = "";
785 
786 
787         //
788         // Matcher creation and reset.
789         //
790         RegexMatcher *m1 = pat2->matcher(inStr1, status);
791         REGEX_CHECK_STATUS;
792         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
793         REGEX_ASSERT(m1->input() == inStr1);
794         m1->reset(instr2);
795         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796         REGEX_ASSERT(m1->input() == instr2);
797         m1->reset(inStr1);
798         REGEX_ASSERT(m1->input() == inStr1);
799         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
800         m1->reset(empty);
801         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
802         REGEX_ASSERT(m1->input() == empty);
803         REGEX_ASSERT(&m1->pattern() == pat2);
804 
805         //
806         //  reset(pos, status)
807         //
808         m1->reset(inStr1);
809         m1->reset(4, status);
810         REGEX_CHECK_STATUS;
811         REGEX_ASSERT(m1->input() == inStr1);
812         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
813 
814         m1->reset(-1, status);
815         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
816         status = U_ZERO_ERROR;
817 
818         m1->reset(0, status);
819         REGEX_CHECK_STATUS;
820         status = U_ZERO_ERROR;
821 
822         int32_t len = m1->input().length();
823         m1->reset(len-1, status);
824         REGEX_CHECK_STATUS;
825         status = U_ZERO_ERROR;
826 
827         m1->reset(len, status);
828         REGEX_CHECK_STATUS;
829         status = U_ZERO_ERROR;
830 
831         m1->reset(len+1, status);
832         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
833         status = U_ZERO_ERROR;
834 
835         //
836         // match(pos, status)
837         //
838         m1->reset(instr2);
839         REGEX_ASSERT(m1->matches(4, status) == TRUE);
840         m1->reset();
841         REGEX_ASSERT(m1->matches(3, status) == FALSE);
842         m1->reset();
843         REGEX_ASSERT(m1->matches(5, status) == FALSE);
844         REGEX_ASSERT(m1->matches(4, status) == TRUE);
845         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
846         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847 
848         // Match() at end of string should fail, but should not
849         //  be an error.
850         status = U_ZERO_ERROR;
851         len = m1->input().length();
852         REGEX_ASSERT(m1->matches(len, status) == FALSE);
853         REGEX_CHECK_STATUS;
854 
855         // Match beyond end of string should fail with an error.
856         status = U_ZERO_ERROR;
857         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859 
860         // Successful match at end of string.
861         {
862             status = U_ZERO_ERROR;
863             RegexMatcher m("A?", 0, status);  // will match zero length string.
864             REGEX_CHECK_STATUS;
865             m.reset(inStr1);
866             len = inStr1.length();
867             REGEX_ASSERT(m.matches(len, status) == TRUE);
868             REGEX_CHECK_STATUS;
869             m.reset(empty);
870             REGEX_ASSERT(m.matches(0, status) == TRUE);
871             REGEX_CHECK_STATUS;
872         }
873 
874 
875         //
876         // lookingAt(pos, status)
877         //
878         status = U_ZERO_ERROR;
879         m1->reset(instr2);  // "not abc"
880         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
881         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
882         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
883         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
884         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
885         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886         status = U_ZERO_ERROR;
887         len = m1->input().length();
888         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
889         REGEX_CHECK_STATUS;
890         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
891         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
892 
893         delete m1;
894         delete pat2;
895     }
896 
897 
898     //
899     // Capture Group.
900     //     RegexMatcher::start();
901     //     RegexMatcher::end();
902     //     RegexMatcher::groupCount();
903     //
904     {
905         int32_t             flags=0;
906         UParseError         pe;
907         UErrorCode          status=U_ZERO_ERROR;
908 
909         UnicodeString       re("01(23(45)67)(.*)");
910         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
911         REGEX_CHECK_STATUS;
912         UnicodeString data = "0123456789";
913 
914         RegexMatcher *matcher = pat->matcher(data, status);
915         REGEX_CHECK_STATUS;
916         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
917         static const int32_t matchStarts[] = {0,  2, 4, 8};
918         static const int32_t matchEnds[]   = {10, 8, 6, 10};
919         int32_t i;
920         for (i=0; i<4; i++) {
921             int32_t actualStart = matcher->start(i, status);
922             REGEX_CHECK_STATUS;
923             if (actualStart != matchStarts[i]) {
924                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
925                     __LINE__, i, matchStarts[i], actualStart);
926             }
927             int32_t actualEnd = matcher->end(i, status);
928             REGEX_CHECK_STATUS;
929             if (actualEnd != matchEnds[i]) {
930                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
931                     __LINE__, i, matchEnds[i], actualEnd);
932             }
933         }
934 
935         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
936         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
937 
938         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940         matcher->reset();
941         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
942 
943         matcher->lookingAt(status);
944         REGEX_ASSERT(matcher->group(status)    == "0123456789");
945         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
946         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
947         REGEX_ASSERT(matcher->group(2, status) == "45"        );
948         REGEX_ASSERT(matcher->group(3, status) == "89"        );
949         REGEX_CHECK_STATUS;
950         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
951         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
952         matcher->reset();
953         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
954 
955         delete matcher;
956         delete pat;
957 
958     }
959 
960     //
961     //  find
962     //
963     {
964         int32_t             flags=0;
965         UParseError         pe;
966         UErrorCode          status=U_ZERO_ERROR;
967 
968         UnicodeString       re("abc");
969         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
970         REGEX_CHECK_STATUS;
971         UnicodeString data = ".abc..abc...abc..";
972         //                    012345678901234567
973 
974         RegexMatcher *matcher = pat->matcher(data, status);
975         REGEX_CHECK_STATUS;
976         REGEX_ASSERT(matcher->find());
977         REGEX_ASSERT(matcher->start(status) == 1);
978         REGEX_ASSERT(matcher->find());
979         REGEX_ASSERT(matcher->start(status) == 6);
980         REGEX_ASSERT(matcher->find());
981         REGEX_ASSERT(matcher->start(status) == 12);
982         REGEX_ASSERT(matcher->find() == FALSE);
983         REGEX_ASSERT(matcher->find() == FALSE);
984 
985         matcher->reset();
986         REGEX_ASSERT(matcher->find());
987         REGEX_ASSERT(matcher->start(status) == 1);
988 
989         REGEX_ASSERT(matcher->find(0, status));
990         REGEX_ASSERT(matcher->start(status) == 1);
991         REGEX_ASSERT(matcher->find(1, status));
992         REGEX_ASSERT(matcher->start(status) == 1);
993         REGEX_ASSERT(matcher->find(2, status));
994         REGEX_ASSERT(matcher->start(status) == 6);
995         REGEX_ASSERT(matcher->find(12, status));
996         REGEX_ASSERT(matcher->start(status) == 12);
997         REGEX_ASSERT(matcher->find(13, status) == FALSE);
998         REGEX_ASSERT(matcher->find(16, status) == FALSE);
999         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1000         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1001 
1002         status = U_ZERO_ERROR;
1003         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1004         status = U_ZERO_ERROR;
1005         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1006 
1007         REGEX_ASSERT(matcher->groupCount() == 0);
1008 
1009         delete matcher;
1010         delete pat;
1011     }
1012 
1013 
1014     //
1015     //  find, with \G in pattern (true if at the end of a previous match).
1016     //
1017     {
1018         int32_t             flags=0;
1019         UParseError         pe;
1020         UErrorCode          status=U_ZERO_ERROR;
1021 
1022         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1023         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1024         REGEX_CHECK_STATUS;
1025         UnicodeString data = ".abcabc.abc..";
1026         //                    012345678901234567
1027 
1028         RegexMatcher *matcher = pat->matcher(data, status);
1029         REGEX_CHECK_STATUS;
1030         REGEX_ASSERT(matcher->find());
1031         REGEX_ASSERT(matcher->start(status) == 0);
1032         REGEX_ASSERT(matcher->start(1, status) == -1);
1033         REGEX_ASSERT(matcher->start(2, status) == 1);
1034 
1035         REGEX_ASSERT(matcher->find());
1036         REGEX_ASSERT(matcher->start(status) == 4);
1037         REGEX_ASSERT(matcher->start(1, status) == 4);
1038         REGEX_ASSERT(matcher->start(2, status) == -1);
1039         REGEX_CHECK_STATUS;
1040 
1041         delete matcher;
1042         delete pat;
1043     }
1044 
1045     //
1046     //   find with zero length matches, match position should bump ahead
1047     //     to prevent loops.
1048     //
1049     {
1050         int32_t                 i;
1051         UErrorCode          status=U_ZERO_ERROR;
1052         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1053                                                       //   using an always-true look-ahead.
1054         REGEX_CHECK_STATUS;
1055         UnicodeString s("    ");
1056         m.reset(s);
1057         for (i=0; ; i++) {
1058             if (m.find() == FALSE) {
1059                 break;
1060             }
1061             REGEX_ASSERT(m.start(status) == i);
1062             REGEX_ASSERT(m.end(status) == i);
1063         }
1064         REGEX_ASSERT(i==5);
1065 
1066         // Check that the bump goes over surrogate pairs OK
1067         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1068         s = s.unescape();
1069         m.reset(s);
1070         for (i=0; ; i+=2) {
1071             if (m.find() == FALSE) {
1072                 break;
1073             }
1074             REGEX_ASSERT(m.start(status) == i);
1075             REGEX_ASSERT(m.end(status) == i);
1076         }
1077         REGEX_ASSERT(i==10);
1078     }
1079     {
1080         // find() loop breaking test.
1081         //        with pattern of /.?/, should see a series of one char matches, then a single
1082         //        match of zero length at the end of the input string.
1083         int32_t                 i;
1084         UErrorCode          status=U_ZERO_ERROR;
1085         RegexMatcher        m(".?", 0, status);
1086         REGEX_CHECK_STATUS;
1087         UnicodeString s("    ");
1088         m.reset(s);
1089         for (i=0; ; i++) {
1090             if (m.find() == FALSE) {
1091                 break;
1092             }
1093             REGEX_ASSERT(m.start(status) == i);
1094             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1095         }
1096         REGEX_ASSERT(i==5);
1097     }
1098 
1099 
1100     //
1101     // Matchers with no input string behave as if they had an empty input string.
1102     //
1103 
1104     {
1105         UErrorCode status = U_ZERO_ERROR;
1106         RegexMatcher  m(".?", 0, status);
1107         REGEX_CHECK_STATUS;
1108         REGEX_ASSERT(m.find());
1109         REGEX_ASSERT(m.start(status) == 0);
1110         REGEX_ASSERT(m.input() == "");
1111     }
1112     {
1113         UErrorCode status = U_ZERO_ERROR;
1114         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1115         RegexMatcher  *m = p->matcher(status);
1116         REGEX_CHECK_STATUS;
1117 
1118         REGEX_ASSERT(m->find() == FALSE);
1119         REGEX_ASSERT(m->input() == "");
1120         delete m;
1121         delete p;
1122     }
1123 
1124     //
1125     // Regions
1126     //
1127     {
1128         UErrorCode status = U_ZERO_ERROR;
1129         UnicodeString testString("This is test data");
1130         RegexMatcher m(".*", testString,  0, status);
1131         REGEX_CHECK_STATUS;
1132         REGEX_ASSERT(m.regionStart() == 0);
1133         REGEX_ASSERT(m.regionEnd() == testString.length());
1134         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1135         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1136 
1137         m.region(2,4, status);
1138         REGEX_CHECK_STATUS;
1139         REGEX_ASSERT(m.matches(status));
1140         REGEX_ASSERT(m.start(status)==2);
1141         REGEX_ASSERT(m.end(status)==4);
1142         REGEX_CHECK_STATUS;
1143 
1144         m.reset();
1145         REGEX_ASSERT(m.regionStart() == 0);
1146         REGEX_ASSERT(m.regionEnd() == testString.length());
1147 
1148         UnicodeString shorterString("short");
1149         m.reset(shorterString);
1150         REGEX_ASSERT(m.regionStart() == 0);
1151         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1152 
1153         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1154         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1155         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1156         REGEX_ASSERT(&m == &m.reset());
1157         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1158 
1159         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1160         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1161         REGEX_ASSERT(&m == &m.reset());
1162         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1163 
1164         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1165         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1166         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1167         REGEX_ASSERT(&m == &m.reset());
1168         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1169 
1170         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1171         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1172         REGEX_ASSERT(&m == &m.reset());
1173         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1174 
1175     }
1176 
1177     //
1178     // hitEnd() and requireEnd()
1179     //
1180     {
1181         UErrorCode status = U_ZERO_ERROR;
1182         UnicodeString testString("aabb");
1183         RegexMatcher m1(".*", testString,  0, status);
1184         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1185         REGEX_ASSERT(m1.hitEnd() == TRUE);
1186         REGEX_ASSERT(m1.requireEnd() == FALSE);
1187         REGEX_CHECK_STATUS;
1188 
1189         status = U_ZERO_ERROR;
1190         RegexMatcher m2("a*", testString, 0, status);
1191         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1192         REGEX_ASSERT(m2.hitEnd() == FALSE);
1193         REGEX_ASSERT(m2.requireEnd() == FALSE);
1194         REGEX_CHECK_STATUS;
1195 
1196         status = U_ZERO_ERROR;
1197         RegexMatcher m3(".*$", testString, 0, status);
1198         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1199         REGEX_ASSERT(m3.hitEnd() == TRUE);
1200         REGEX_ASSERT(m3.requireEnd() == TRUE);
1201         REGEX_CHECK_STATUS;
1202     }
1203 
1204 
1205     //
1206     // Compilation error on reset with UChar *
1207     //   These were a hazard that people were stumbling over with runtime errors.
1208     //   Changed them to compiler errors by adding private methods that more closely
1209     //   matched the incorrect use of the functions.
1210     //
1211 #if 0
1212     {
1213         UErrorCode status = U_ZERO_ERROR;
1214         UChar ucharString[20];
1215         RegexMatcher m(".", 0, status);
1216         m.reset(ucharString);  // should not compile.
1217 
1218         RegexPattern *p = RegexPattern::compile(".", 0, status);
1219         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1220 
1221         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1222     }
1223 #endif
1224 
1225     //
1226     //  Time Outs.
1227     //       Note:  These tests will need to be changed when the regexp engine is
1228     //              able to detect and cut short the exponential time behavior on
1229     //              this type of match.
1230     //
1231     {
1232         UErrorCode status = U_ZERO_ERROR;
1233         //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1235         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1236         RegexMatcher matcher("(a+)+b", testString, 0, status);
1237         REGEX_CHECK_STATUS;
1238         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1239         matcher.setTimeLimit(100, status);
1240         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1241         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1242         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1243     }
1244     {
1245         UErrorCode status = U_ZERO_ERROR;
1246         //   Few enough 'a's to slip in under the time limit.
1247         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1248         RegexMatcher matcher("(a+)+b", testString, 0, status);
1249         REGEX_CHECK_STATUS;
1250         matcher.setTimeLimit(100, status);
1251         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1252         REGEX_CHECK_STATUS;
1253     }
1254 
1255     //
1256     //  Stack Limits
1257     //
1258     {
1259         UErrorCode status = U_ZERO_ERROR;
1260         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1261 
1262         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1263         //   of the '+', and makes the stack frames larger.
1264         RegexMatcher matcher("(A)+A$", testString, 0, status);
1265 
1266         // With the default stack, this match should fail to run
1267         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1269 
1270         // With unlimited stack, it should run
1271         status = U_ZERO_ERROR;
1272         matcher.setStackLimit(0, status);
1273         REGEX_CHECK_STATUS;
1274         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1275         REGEX_CHECK_STATUS;
1276         REGEX_ASSERT(matcher.getStackLimit() == 0);
1277 
        // With a limited stack, the match should fail
1279         status = U_ZERO_ERROR;
1280         matcher.setStackLimit(10000, status);
1281         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1282         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1283         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1284     }
1285 
1286         // A pattern that doesn't save state should work with
1287         //   a minimal sized stack
1288     {
1289         UErrorCode status = U_ZERO_ERROR;
1290         UnicodeString testString = "abc";
1291         RegexMatcher matcher("abc", testString, 0, status);
1292         REGEX_CHECK_STATUS;
1293         matcher.setStackLimit(30, status);
1294         REGEX_CHECK_STATUS;
1295         REGEX_ASSERT(matcher.matches(status) == TRUE);
1296         REGEX_CHECK_STATUS;
1297         REGEX_ASSERT(matcher.getStackLimit() == 30);
1298 
1299         // Negative stack sizes should fail
1300         status = U_ZERO_ERROR;
1301         matcher.setStackLimit(1000, status);
1302         REGEX_CHECK_STATUS;
1303         matcher.setStackLimit(-1, status);
1304         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1305         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1306     }
1307 
1308 
1309 }
1310 
1311 
1312 
1313 
1314 
1315 
1316 //---------------------------------------------------------------------------
1317 //
1318 //      API_Replace        API test for class RegexMatcher, testing the
1319 //                         Replace family of functions.
1320 //
1321 //---------------------------------------------------------------------------
API_Replace()1322 void RegexTest::API_Replace() {
1323     //
1324     //  Replace
1325     //
1326     int32_t             flags=0;
1327     UParseError         pe;
1328     UErrorCode          status=U_ZERO_ERROR;
1329 
1330     UnicodeString       re("abc");
1331     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1332     REGEX_CHECK_STATUS;
1333     UnicodeString data = ".abc..abc...abc..";
1334     //                    012345678901234567
1335     RegexMatcher *matcher = pat->matcher(data, status);
1336 
1337     //
1338     //  Plain vanilla matches.
1339     //
1340     UnicodeString  dest;
1341     dest = matcher->replaceFirst("yz", status);
1342     REGEX_CHECK_STATUS;
1343     REGEX_ASSERT(dest == ".yz..abc...abc..");
1344 
1345     dest = matcher->replaceAll("yz", status);
1346     REGEX_CHECK_STATUS;
1347     REGEX_ASSERT(dest == ".yz..yz...yz..");
1348 
1349     //
1350     //  Plain vanilla non-matches.
1351     //
1352     UnicodeString d2 = ".abx..abx...abx..";
1353     matcher->reset(d2);
1354     dest = matcher->replaceFirst("yz", status);
1355     REGEX_CHECK_STATUS;
1356     REGEX_ASSERT(dest == ".abx..abx...abx..");
1357 
1358     dest = matcher->replaceAll("yz", status);
1359     REGEX_CHECK_STATUS;
1360     REGEX_ASSERT(dest == ".abx..abx...abx..");
1361 
1362     //
1363     // Empty source string
1364     //
1365     UnicodeString d3 = "";
1366     matcher->reset(d3);
1367     dest = matcher->replaceFirst("yz", status);
1368     REGEX_CHECK_STATUS;
1369     REGEX_ASSERT(dest == "");
1370 
1371     dest = matcher->replaceAll("yz", status);
1372     REGEX_CHECK_STATUS;
1373     REGEX_ASSERT(dest == "");
1374 
1375     //
1376     // Empty substitution string
1377     //
1378     matcher->reset(data);              // ".abc..abc...abc.."
1379     dest = matcher->replaceFirst("", status);
1380     REGEX_CHECK_STATUS;
1381     REGEX_ASSERT(dest == "...abc...abc..");
1382 
1383     dest = matcher->replaceAll("", status);
1384     REGEX_CHECK_STATUS;
1385     REGEX_ASSERT(dest == "........");
1386 
1387     //
1388     // match whole string
1389     //
1390     UnicodeString d4 = "abc";
1391     matcher->reset(d4);
1392     dest = matcher->replaceFirst("xyz", status);
1393     REGEX_CHECK_STATUS;
1394     REGEX_ASSERT(dest == "xyz");
1395 
1396     dest = matcher->replaceAll("xyz", status);
1397     REGEX_CHECK_STATUS;
1398     REGEX_ASSERT(dest == "xyz");
1399 
1400     //
1401     // Capture Group, simple case
1402     //
1403     UnicodeString       re2("a(..)");
1404     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1405     REGEX_CHECK_STATUS;
1406     UnicodeString d5 = "abcdefg";
1407     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1408     REGEX_CHECK_STATUS;
1409     dest = matcher2->replaceFirst("$1$1", status);
1410     REGEX_CHECK_STATUS;
1411     REGEX_ASSERT(dest == "bcbcdefg");
1412 
1413     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1414     REGEX_CHECK_STATUS;
1415     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1416 
1417     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1418     REGEX_ASSERT(U_FAILURE(status));
1419     status = U_ZERO_ERROR;
1420 
1421     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1422     replacement = replacement.unescape();
1423     dest = matcher2->replaceFirst(replacement, status);
1424     REGEX_CHECK_STATUS;
1425     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1426 
1427     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1428 
1429 
1430     //
1431     // Replacement String with \u hex escapes
1432     //
1433     {
1434         UnicodeString  src = "abc 1 abc 2 abc 3";
1435         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1436         matcher->reset(src);
1437         UnicodeString  result = matcher->replaceAll(substitute, status);
1438         REGEX_CHECK_STATUS;
1439         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1440     }
1441     {
1442         UnicodeString  src = "abc !";
1443         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1444         matcher->reset(src);
1445         UnicodeString  result = matcher->replaceAll(substitute, status);
1446         REGEX_CHECK_STATUS;
1447         UnicodeString expected = UnicodeString("--");
1448         expected.append((UChar32)0x10000);
1449         expected.append("-- !");
1450         REGEX_ASSERT(result == expected);
1451     }
1452     // TODO:  need more through testing of capture substitutions.
1453 
1454     // Bug 4057
1455     //
1456     {
1457         status = U_ZERO_ERROR;
1458         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1459         RegexMatcher m("ss(.*?)ee", 0, status);
1460         REGEX_CHECK_STATUS;
1461         UnicodeString result;
1462 
1463         // Multiple finds do NOT bump up the previous appendReplacement position.
1464         m.reset(s);
1465         m.find();
1466         m.find();
1467         m.appendReplacement(result, "ooh", status);
1468         REGEX_CHECK_STATUS;
1469         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1470 
1471         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1472         status = U_ZERO_ERROR;
1473         result.truncate(0);
1474         m.reset(10, status);
1475         m.find();
1476         m.find();
1477         m.appendReplacement(result, "ooh", status);
1478         REGEX_CHECK_STATUS;
1479         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1480 
1481         // find() at interior of string, appendReplacemnt still starts at beginning.
1482         status = U_ZERO_ERROR;
1483         result.truncate(0);
1484         m.reset();
1485         m.find(10, status);
1486         m.find();
1487         m.appendReplacement(result, "ooh", status);
1488         REGEX_CHECK_STATUS;
1489         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1490 
1491         m.appendTail(result);
1492         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1493 
1494     }
1495 
1496     delete matcher2;
1497     delete pat2;
1498     delete matcher;
1499     delete pat;
1500 }
1501 
1502 
1503 //---------------------------------------------------------------------------
1504 //
1505 //      API_Pattern       Test that the API for class RegexPattern is
1506 //                        present and nominally working.
1507 //
1508 //---------------------------------------------------------------------------
API_Pattern()1509 void RegexTest::API_Pattern() {
1510     RegexPattern        pata;    // Test default constructor to not crash.
1511     RegexPattern        patb;
1512 
1513     REGEX_ASSERT(pata == patb);
1514     REGEX_ASSERT(pata == pata);
1515 
1516     UnicodeString re1("abc[a-l][m-z]");
1517     UnicodeString re2("def");
1518     UErrorCode    status = U_ZERO_ERROR;
1519     UParseError   pe;
1520 
1521     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1522     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1523     REGEX_CHECK_STATUS;
1524     REGEX_ASSERT(*pat1 == *pat1);
1525     REGEX_ASSERT(*pat1 != pata);
1526 
1527     // Assign
1528     patb = *pat1;
1529     REGEX_ASSERT(patb == *pat1);
1530 
1531     // Copy Construct
1532     RegexPattern patc(*pat1);
1533     REGEX_ASSERT(patc == *pat1);
1534     REGEX_ASSERT(patb == patc);
1535     REGEX_ASSERT(pat1 != pat2);
1536     patb = *pat2;
1537     REGEX_ASSERT(patb != patc);
1538     REGEX_ASSERT(patb == *pat2);
1539 
1540     // Compile with no flags.
1541     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1542     REGEX_ASSERT(*pat1a == *pat1);
1543 
1544     REGEX_ASSERT(pat1a->flags() == 0);
1545 
1546     // Compile with different flags should be not equal
1547     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1548     REGEX_CHECK_STATUS;
1549 
1550     REGEX_ASSERT(*pat1b != *pat1a);
1551     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1552     REGEX_ASSERT(pat1a->flags() == 0);
1553     delete pat1b;
1554 
1555     // clone
1556     RegexPattern *pat1c = pat1->clone();
1557     REGEX_ASSERT(*pat1c == *pat1);
1558     REGEX_ASSERT(*pat1c != *pat2);
1559 
1560     delete pat1c;
1561     delete pat1a;
1562     delete pat1;
1563     delete pat2;
1564 
1565 
1566     //
1567     //   Verify that a matcher created from a cloned pattern works.
1568     //     (Jitterbug 3423)
1569     //
1570     {
1571         UErrorCode     status     = U_ZERO_ERROR;
1572         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1573         RegexPattern  *pClone     = pSource->clone();
1574         delete         pSource;
1575         RegexMatcher  *mFromClone = pClone->matcher(status);
1576         REGEX_CHECK_STATUS;
1577         UnicodeString s = "Hello World";
1578         mFromClone->reset(s);
1579         REGEX_ASSERT(mFromClone->find() == TRUE);
1580         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1581         REGEX_ASSERT(mFromClone->find() == TRUE);
1582         REGEX_ASSERT(mFromClone->group(status) == "World");
1583         REGEX_ASSERT(mFromClone->find() == FALSE);
1584         delete mFromClone;
1585         delete pClone;
1586     }
1587 
1588     //
1589     //   matches convenience API
1590     //
1591     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1592     REGEX_CHECK_STATUS;
1593     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1594     REGEX_CHECK_STATUS;
1595     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1596     REGEX_CHECK_STATUS;
1597     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1598     REGEX_CHECK_STATUS;
1599     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1600     REGEX_CHECK_STATUS;
1601     status = U_INDEX_OUTOFBOUNDS_ERROR;
1602     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1603     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1604 
1605 
1606     //
1607     // Split()
1608     //
1609     status = U_ZERO_ERROR;
1610     pat1 = RegexPattern::compile(" +",  pe, status);
1611     REGEX_CHECK_STATUS;
1612     UnicodeString  fields[10];
1613 
1614     int32_t n;
1615     n = pat1->split("Now is the time", fields, 10, status);
1616     REGEX_CHECK_STATUS;
1617     REGEX_ASSERT(n==4);
1618     REGEX_ASSERT(fields[0]=="Now");
1619     REGEX_ASSERT(fields[1]=="is");
1620     REGEX_ASSERT(fields[2]=="the");
1621     REGEX_ASSERT(fields[3]=="time");
1622     REGEX_ASSERT(fields[4]=="");
1623 
1624     n = pat1->split("Now is the time", fields, 2, status);
1625     REGEX_CHECK_STATUS;
1626     REGEX_ASSERT(n==2);
1627     REGEX_ASSERT(fields[0]=="Now");
1628     REGEX_ASSERT(fields[1]=="is the time");
1629     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1630 
1631     fields[1] = "*";
1632     status = U_ZERO_ERROR;
1633     n = pat1->split("Now is the time", fields, 1, status);
1634     REGEX_CHECK_STATUS;
1635     REGEX_ASSERT(n==1);
1636     REGEX_ASSERT(fields[0]=="Now is the time");
1637     REGEX_ASSERT(fields[1]=="*");
1638     status = U_ZERO_ERROR;
1639 
1640     n = pat1->split("    Now       is the time   ", fields, 10, status);
1641     REGEX_CHECK_STATUS;
1642     REGEX_ASSERT(n==6);
1643     REGEX_ASSERT(fields[0]=="");
1644     REGEX_ASSERT(fields[1]=="Now");
1645     REGEX_ASSERT(fields[2]=="is");
1646     REGEX_ASSERT(fields[3]=="the");
1647     REGEX_ASSERT(fields[4]=="time");
1648     REGEX_ASSERT(fields[5]=="");
1649 
1650     n = pat1->split("     ", fields, 10, status);
1651     REGEX_CHECK_STATUS;
1652     REGEX_ASSERT(n==2);
1653     REGEX_ASSERT(fields[0]=="");
1654     REGEX_ASSERT(fields[1]=="");
1655 
1656     fields[0] = "foo";
1657     n = pat1->split("", fields, 10, status);
1658     REGEX_CHECK_STATUS;
1659     REGEX_ASSERT(n==0);
1660     REGEX_ASSERT(fields[0]=="foo");
1661 
1662     delete pat1;
1663 
1664     //  split, with a pattern with (capture)
1665     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1666     REGEX_CHECK_STATUS;
1667 
1668     status = U_ZERO_ERROR;
1669     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1670     REGEX_CHECK_STATUS;
1671     REGEX_ASSERT(n==7);
1672     REGEX_ASSERT(fields[0]=="");
1673     REGEX_ASSERT(fields[1]=="a");
1674     REGEX_ASSERT(fields[2]=="Now is ");
1675     REGEX_ASSERT(fields[3]=="b");
1676     REGEX_ASSERT(fields[4]=="the time");
1677     REGEX_ASSERT(fields[5]=="c");
1678     REGEX_ASSERT(fields[6]=="");
1679     REGEX_ASSERT(status==U_ZERO_ERROR);
1680 
1681     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1682     REGEX_CHECK_STATUS;
1683     REGEX_ASSERT(n==7);
1684     REGEX_ASSERT(fields[0]=="  ");
1685     REGEX_ASSERT(fields[1]=="a");
1686     REGEX_ASSERT(fields[2]=="Now is ");
1687     REGEX_ASSERT(fields[3]=="b");
1688     REGEX_ASSERT(fields[4]=="the time");
1689     REGEX_ASSERT(fields[5]=="c");
1690     REGEX_ASSERT(fields[6]=="");
1691 
1692     status = U_ZERO_ERROR;
1693     fields[6] = "foo";
1694     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1695     REGEX_CHECK_STATUS;
1696     REGEX_ASSERT(n==6);
1697     REGEX_ASSERT(fields[0]=="  ");
1698     REGEX_ASSERT(fields[1]=="a");
1699     REGEX_ASSERT(fields[2]=="Now is ");
1700     REGEX_ASSERT(fields[3]=="b");
1701     REGEX_ASSERT(fields[4]=="the time");
1702     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1703     REGEX_ASSERT(fields[6]=="foo");
1704 
1705     status = U_ZERO_ERROR;
1706     fields[5] = "foo";
1707     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1708     REGEX_CHECK_STATUS;
1709     REGEX_ASSERT(n==5);
1710     REGEX_ASSERT(fields[0]=="  ");
1711     REGEX_ASSERT(fields[1]=="a");
1712     REGEX_ASSERT(fields[2]=="Now is ");
1713     REGEX_ASSERT(fields[3]=="b");
1714     REGEX_ASSERT(fields[4]=="the time<c>");
1715     REGEX_ASSERT(fields[5]=="foo");
1716 
1717     status = U_ZERO_ERROR;
1718     fields[5] = "foo";
1719     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1720     REGEX_CHECK_STATUS;
1721     REGEX_ASSERT(n==5);
1722     REGEX_ASSERT(fields[0]=="  ");
1723     REGEX_ASSERT(fields[1]=="a");
1724     REGEX_ASSERT(fields[2]=="Now is ");
1725     REGEX_ASSERT(fields[3]=="b");
1726     REGEX_ASSERT(fields[4]=="the time");
1727     REGEX_ASSERT(fields[5]=="foo");
1728 
1729     status = U_ZERO_ERROR;
1730     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1731     REGEX_CHECK_STATUS;
1732     REGEX_ASSERT(n==4);
1733     REGEX_ASSERT(fields[0]=="  ");
1734     REGEX_ASSERT(fields[1]=="a");
1735     REGEX_ASSERT(fields[2]=="Now is ");
1736     REGEX_ASSERT(fields[3]=="the time<c>");
1737     status = U_ZERO_ERROR;
1738     delete pat1;
1739 
1740     pat1 = RegexPattern::compile("([-,])",  pe, status);
1741     REGEX_CHECK_STATUS;
1742     n = pat1->split("1-10,20", fields, 10, status);
1743     REGEX_CHECK_STATUS;
1744     REGEX_ASSERT(n==5);
1745     REGEX_ASSERT(fields[0]=="1");
1746     REGEX_ASSERT(fields[1]=="-");
1747     REGEX_ASSERT(fields[2]=="10");
1748     REGEX_ASSERT(fields[3]==",");
1749     REGEX_ASSERT(fields[4]=="20");
1750     delete pat1;
1751 
1752     // Test split of string with empty trailing fields
1753     pat1 = RegexPattern::compile(",", pe, status);
1754     REGEX_CHECK_STATUS;
1755     n = pat1->split("a,b,c,", fields, 10, status);
1756     REGEX_CHECK_STATUS;
1757     REGEX_ASSERT(n==4);
1758     REGEX_ASSERT(fields[0]=="a");
1759     REGEX_ASSERT(fields[1]=="b");
1760     REGEX_ASSERT(fields[2]=="c");
1761     REGEX_ASSERT(fields[3]=="");
1762 
1763     n = pat1->split("a,,,", fields, 10, status);
1764     REGEX_CHECK_STATUS;
1765     REGEX_ASSERT(n==4);
1766     REGEX_ASSERT(fields[0]=="a");
1767     REGEX_ASSERT(fields[1]=="");
1768     REGEX_ASSERT(fields[2]=="");
1769     REGEX_ASSERT(fields[3]=="");
1770     delete pat1;
1771 
1772     // Split Separator with zero length match.
1773     pat1 = RegexPattern::compile(":?", pe, status);
1774     REGEX_CHECK_STATUS;
1775     n = pat1->split("abc", fields, 10, status);
1776     REGEX_CHECK_STATUS;
1777     REGEX_ASSERT(n==5);
1778     REGEX_ASSERT(fields[0]=="");
1779     REGEX_ASSERT(fields[1]=="a");
1780     REGEX_ASSERT(fields[2]=="b");
1781     REGEX_ASSERT(fields[3]=="c");
1782     REGEX_ASSERT(fields[4]=="");
1783 
1784     delete pat1;
1785 
1786     //
1787     // RegexPattern::pattern()
1788     //
1789     pat1 = new RegexPattern();
1790     REGEX_ASSERT(pat1->pattern() == "");
1791     delete pat1;
1792 
1793     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1794     REGEX_CHECK_STATUS;
1795     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1796     delete pat1;
1797 
1798 
1799     //
1800     // classID functions
1801     //
1802     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1803     REGEX_CHECK_STATUS;
1804     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1805     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1806     UnicodeString Hello("Hello, world.");
1807     RegexMatcher *m = pat1->matcher(Hello, status);
1808     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1809     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1810     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1811     delete m;
1812     delete pat1;
1813 
1814 }
1815 
1816 //---------------------------------------------------------------------------
1817 //
1818 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1819 //                       is present and working, but excluding functions
1820 //                       implementing replace operations.  Patterns and input
1821 //                       are mostly supplied as UText opened over UTF-8 bytes.
1822 //---------------------------------------------------------------------------
API_Match_UTF8()1823 void RegexTest::API_Match_UTF8() {
1824     UParseError         pe;
1825     UErrorCode          status=U_ZERO_ERROR;
1826     int32_t             flags = 0;   // flags value handed to RegexPattern::compile() below
1827 
1828     //
1829     // Debug - move failing test cases into the block below and enable the #if to run them first
1830     //
1831 #if 0
1832     {
1833     }
1834     return;
1835 #endif
1836 
1837     //
1838     // Simple pattern compilation
1839     //
1840     {
1841         UText               re = UTEXT_INITIALIZER;
1842         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1843         REGEX_VERBOSE_TEXT(&re);
1844         RegexPattern        *pat2;
1845         pat2 = RegexPattern::compile(&re, flags, pe, status);
1846         REGEX_CHECK_STATUS;
1847 
1848         UText input1 = UTEXT_INITIALIZER;
1849         UText input2 = UTEXT_INITIALIZER;
1850         UText empty  = UTEXT_INITIALIZER;
1851         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1852         REGEX_VERBOSE_TEXT(&input1);
1853         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1854         REGEX_VERBOSE_TEXT(&input2);
1855         utext_openUChars(&empty, NULL, 0, &status);   // zero-length text (length checked further down)
1856 
1857         int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1858         int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1859 
1860 
1861         //
1862         // Matcher creation and reset.
1863         //   reset(UText *) is chained onto the freshly created matcher; the address of what it returns is kept in m1.
1864         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1865         REGEX_CHECK_STATUS;
1866         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1867         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1868         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869         m1->reset(&input2);
1870         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1871         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1872         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1873         m1->reset(&input1);
1874         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1875         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876         m1->reset(&empty);
1877         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1878         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1879 
1880         //
1881         //  reset(pos, status)
1882         //    Expected: positions 0 .. input1Len are accepted; -1 and input1Len+1 set U_INDEX_OUTOFBOUNDS_ERROR.
1883         m1->reset(&input1);
1884         m1->reset(4, status);
1885         REGEX_CHECK_STATUS;
1886         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1887         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1888 
1889         m1->reset(-1, status);
1890         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1891         status = U_ZERO_ERROR;
1892 
1893         m1->reset(0, status);
1894         REGEX_CHECK_STATUS;
1895         status = U_ZERO_ERROR;
1896 
1897         m1->reset(input1Len-1, status);
1898         REGEX_CHECK_STATUS;
1899         status = U_ZERO_ERROR;
1900 
1901         m1->reset(input1Len, status);
1902         REGEX_CHECK_STATUS;
1903         status = U_ZERO_ERROR;
1904 
1905         m1->reset(input1Len+1, status);
1906         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1907         status = U_ZERO_ERROR;
1908 
1909         //
1910         // matches(pos, status)
1911         //    With input2 ("not abc") the test expects success only when starting at position 4.
1912         m1->reset(&input2);
1913         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1914         m1->reset();
1915         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1916         m1->reset();
1917         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1918         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1919         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1920         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1921 
1922         // Match() at end of string should fail, but should not
1923         //  be an error.
1924         status = U_ZERO_ERROR;
1925         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1926         REGEX_CHECK_STATUS;
1927 
1928         // Match beyond end of string should fail with an error.
1929         status = U_ZERO_ERROR;
1930         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1931         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1932 
1933         // Successful match at end of string.
1934         {
1935             status = U_ZERO_ERROR;
1936             RegexMatcher m("A?", 0, status);  // will match zero length string.
1937             REGEX_CHECK_STATUS;
1938             m.reset(&input1);
1939             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1940             REGEX_CHECK_STATUS;
1941             m.reset(&empty);
1942             REGEX_ASSERT(m.matches(0, status) == TRUE);
1943             REGEX_CHECK_STATUS;
1944         }
1945 
1946 
1947         //
1948         // lookingAt(pos, status)
1949         //    Same position/boundary expectations as the matches(pos, status) checks above.
1950         status = U_ZERO_ERROR;
1951         m1->reset(&input2);  // "not abc"
1952         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1953         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1954         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1955         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1956         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1957         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958         status = U_ZERO_ERROR;
1959         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1960         REGEX_CHECK_STATUS;
1961         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1962         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1963 
1964         delete m1;
1965         delete pat2;
1966 
1967         utext_close(&re);
1968         utext_close(&input1);
1969         utext_close(&input2);
1970         utext_close(&empty);
1971     }
1972 
1973 
1974     //
1975     // Capture Group.
1976     //     RegexMatcher::start();
1977     //     RegexMatcher::end();
1978     //     RegexMatcher::groupCount();
1979     //
1980     {
1981         int32_t             flags=0;                // flags, pe and status here shadow the function-level variables
1982         UParseError         pe;
1983         UErrorCode          status=U_ZERO_ERROR;
1984         UText               re=UTEXT_INITIALIZER;
1985         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1986         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1987 
1988         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1989         REGEX_CHECK_STATUS;
1990 
1991         UText input = UTEXT_INITIALIZER;
1992         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1993         utext_openUTF8(&input, str_0123456789, -1, &status);
1994 
1995         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1996         REGEX_CHECK_STATUS;
1997         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1998         static const int32_t matchStarts[] = {0,  2, 4, 8};    // expected start(i) for groups 0..3
1999         static const int32_t matchEnds[]   = {10, 8, 6, 10};   // expected end(i) for groups 0..3
2000         int32_t i;
2001         for (i=0; i<4; i++) {
2002             int32_t actualStart = matcher->start(i, status);
2003             REGEX_CHECK_STATUS;
2004             if (actualStart != matchStarts[i]) {
2005                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2006                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2007             }
2008             int32_t actualEnd = matcher->end(i, status);
2009             REGEX_CHECK_STATUS;
2010             if (actualEnd != matchEnds[i]) {
2011                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2012                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2013             }
2014         }
2015 
2016         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2017         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2018 
2019         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2021         matcher->reset();
2022         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2023 
2024         matcher->lookingAt(status);   // match again after the reset() above, before the group() checks
2025 
2026         UnicodeString dest;
2027         UText destText = UTEXT_INITIALIZER;
2028         utext_openUnicodeString(&destText, &dest, &status);
2029         UText *result;
2030         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2031         //  Test shallow-clone API
2032         int64_t   group_len;
2033         result = matcher->group((UText *)NULL, group_len, status);
2034         REGEX_CHECK_STATUS;
2035         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036         utext_close(result);
2037         result = matcher->group(0, &destText, group_len, status);
2038         REGEX_CHECK_STATUS;
2039         REGEX_ASSERT(result == &destText);
2040         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041         //  destText is now immutable, reopen it
2042         utext_close(&destText);
2043         utext_openUnicodeString(&destText, &dest, &status);
2044 
2045         int64_t length;
2046         result = matcher->group(0, NULL, length, status);
2047         REGEX_CHECK_STATUS;
2048         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2049         utext_close(result);
2050         result = matcher->group(0, &destText, length, status);
2051         REGEX_CHECK_STATUS;
2052         REGEX_ASSERT(result == &destText);
2053         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2054         REGEX_ASSERT(length == 10);
2055         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2056 
2057         // Capture Group 1 == "234567"  (expected: result holds the whole input, indexed at the group start; length is the group length)
2058         result = matcher->group(1, NULL, length, status);
2059         REGEX_CHECK_STATUS;
2060         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2061         REGEX_ASSERT(length == 6);
2062         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2063         utext_close(result);
2064 
2065         result = matcher->group(1, &destText, length, status);
2066         REGEX_CHECK_STATUS;
2067         REGEX_ASSERT(result == &destText);
2068         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2069         REGEX_ASSERT(length == 6);
2070         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2071         utext_close(result);   // result is expected to be &destText here (asserted above); destText is passed to group() again below
2072 
2073         // Capture Group 2 == "45"
2074         result = matcher->group(2, NULL, length, status);
2075         REGEX_CHECK_STATUS;
2076         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2077         REGEX_ASSERT(length == 2);
2078         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079         utext_close(result);
2080 
2081         result = matcher->group(2, &destText, length, status);
2082         REGEX_CHECK_STATUS;
2083         REGEX_ASSERT(result == &destText);
2084         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2085         REGEX_ASSERT(length == 2);
2086         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2087         utext_close(result);
2088 
2089         // Capture Group 3 == "89"
2090         result = matcher->group(3, NULL, length, status);
2091         REGEX_CHECK_STATUS;
2092         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2093         REGEX_ASSERT(length == 2);
2094         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2095         utext_close(result);
2096 
2097         result = matcher->group(3, &destText, length, status);
2098         REGEX_CHECK_STATUS;
2099         REGEX_ASSERT(result == &destText);
2100         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2101         REGEX_ASSERT(length == 2);
2102         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2103         utext_close(result);
2104 
2105         // Capture Group number out of range.
2106         status = U_ZERO_ERROR;
2107         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2108         status = U_ZERO_ERROR;
2109         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2110         status = U_ZERO_ERROR;
2111         matcher->reset();
2112         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2113 
2114         delete matcher;
2115         delete pat;
2116 
2117         utext_close(&destText);
2118         utext_close(&input);
2119         utext_close(&re);
2120     }
2121 
2122     //
2123     //  find
2124     //
2125     {
2126         int32_t             flags=0;
2127         UParseError         pe;
2128         UErrorCode          status=U_ZERO_ERROR;
2129         UText               re=UTEXT_INITIALIZER;
2130         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2131         utext_openUTF8(&re, str_abc, -1, &status);
2132 
2133         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2134         REGEX_CHECK_STATUS;
2135         UText input = UTEXT_INITIALIZER;
2136         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2137         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2138         //                      012345678901234567
2139 
2140         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2141         REGEX_CHECK_STATUS;
2142         REGEX_ASSERT(matcher->find());
2143         REGEX_ASSERT(matcher->start(status) == 1);
2144         REGEX_ASSERT(matcher->find());
2145         REGEX_ASSERT(matcher->start(status) == 6);
2146         REGEX_ASSERT(matcher->find());
2147         REGEX_ASSERT(matcher->start(status) == 12);
2148         REGEX_ASSERT(matcher->find() == FALSE);
2149         REGEX_ASSERT(matcher->find() == FALSE);
2150 
2151         matcher->reset();
2152         REGEX_ASSERT(matcher->find());
2153         REGEX_ASSERT(matcher->start(status) == 1);
2154 
2155         REGEX_ASSERT(matcher->find(0, status));
2156         REGEX_ASSERT(matcher->start(status) == 1);
2157         REGEX_ASSERT(matcher->find(1, status));
2158         REGEX_ASSERT(matcher->start(status) == 1);
2159         REGEX_ASSERT(matcher->find(2, status));
2160         REGEX_ASSERT(matcher->start(status) == 6);
2161         REGEX_ASSERT(matcher->find(12, status));
2162         REGEX_ASSERT(matcher->start(status) == 12);
2163         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2164         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2165         REGEX_ASSERT(matcher->find(17, status) == FALSE);   // 17 is the input length: expected to fail without an error
2166         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2167 
2168         status = U_ZERO_ERROR;
2169         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2170         status = U_ZERO_ERROR;
2171         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2172 
2173         REGEX_ASSERT(matcher->groupCount() == 0);
2174 
2175         delete matcher;
2176         delete pat;
2177 
2178         utext_close(&input);
2179         utext_close(&re);
2180     }
2181 
2182 
2183     //
2184     //  find, with \G in pattern (true if at the end of a previous match).
2185     //
2186     {
2187         int32_t             flags=0;
2188         UParseError         pe;
2189         UErrorCode          status=U_ZERO_ERROR;
2190         UText               re=UTEXT_INITIALIZER;
2191         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) */
2192         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2193 
2194         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2195 
2196         REGEX_CHECK_STATUS;
2197         UText input = UTEXT_INITIALIZER;
2198         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2199         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2200         //                      0123456789012
2201 
2202         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2203         REGEX_CHECK_STATUS;
2204         REGEX_ASSERT(matcher->find());
2205         REGEX_ASSERT(matcher->start(status) == 0);
2206         REGEX_ASSERT(matcher->start(1, status) == -1);   // first match: expected through group 2 (plain "abc"), group 1 reports -1
2207         REGEX_ASSERT(matcher->start(2, status) == 1);
2208 
2209         REGEX_ASSERT(matcher->find());
2210         REGEX_ASSERT(matcher->start(status) == 4);
2211         REGEX_ASSERT(matcher->start(1, status) == 4);    // second match: expected through group 1 (\Gabc), group 2 reports -1
2212         REGEX_ASSERT(matcher->start(2, status) == -1);
2213         REGEX_CHECK_STATUS;
2214 
2215         delete matcher;
2216         delete pat;
2217 
2218         utext_close(&input);
2219         utext_close(&re);
2220     }
2221 
2222     //
2223     //   find with zero length matches, match position should bump ahead
2224     //     to prevent loops.
2225     //
2226     {
2227         int32_t                 i;
2228         UErrorCode          status=U_ZERO_ERROR;
2229         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2230                                                       //   using an always-true look-ahead.
2231         REGEX_CHECK_STATUS;
2232         UText s = UTEXT_INITIALIZER;
2233         utext_openUTF8(&s, "    ", -1, &status);
2234         m.reset(&s);
2235         for (i=0; ; i++) {                            // no loop condition: exits only through the break below
2236             if (m.find() == FALSE) {
2237                 break;
2238             }
2239             REGEX_ASSERT(m.start(status) == i);
2240             REGEX_ASSERT(m.end(status) == i);
2241         }
2242         REGEX_ASSERT(i==5);                           // five matches expected, at positions 0 through 4
2243 
2244         // Check that the bump goes over characters outside the BMP OK
2245         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2246         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2247         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);   // s is opened again, now over the supplementary characters
2248         m.reset(&s);
2249         for (i=0; ; i+=4) {                           // step of 4: each character above occupies four UTF-8 bytes
2250             if (m.find() == FALSE) {
2251                 break;
2252             }
2253             REGEX_ASSERT(m.start(status) == i);
2254             REGEX_ASSERT(m.end(status) == i);
2255         }
2256         REGEX_ASSERT(i==20);                          // five matches expected, at native indexes 0, 4, 8, 12 and 16
2257 
2258         utext_close(&s);
2259     }
2260     {
2261         // find() loop breaking test.
2262         //        with pattern of /.?/, should see a series of one char matches, then a single
2263         //        match of zero length at the end of the input string.
2264         int32_t                 i;
2265         UErrorCode          status=U_ZERO_ERROR;
2266         RegexMatcher        m(".?", 0, status);
2267         REGEX_CHECK_STATUS;
2268         UText s = UTEXT_INITIALIZER;
2269         utext_openUTF8(&s, "    ", -1, &status);
2270         m.reset(&s);
2271         for (i=0; ; i++) {
2272             if (m.find() == FALSE) {
2273                 break;
2274             }
2275             REGEX_ASSERT(m.start(status) == i);
2276             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // i==4 is the zero-length match at the end of the input
2277         }
2278         REGEX_ASSERT(i==5);
2279 
2280         utext_close(&s);
2281     }
2282 
2283 
2284     //
2285     // Matchers with no input string behave as if they had an empty input string.
2286     //
2287 
2288     {
2289         UErrorCode status = U_ZERO_ERROR;
2290         RegexMatcher  m(".?", 0, status);
2291         REGEX_CHECK_STATUS;
2292         REGEX_ASSERT(m.find());
2293         REGEX_ASSERT(m.start(status) == 0);
2294         REGEX_ASSERT(m.input() == "");
2295     }
2296     {
2297         UErrorCode status = U_ZERO_ERROR;
2298         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2299         RegexMatcher  *m = p->matcher(status);
2300         REGEX_CHECK_STATUS;
2301 
2302         REGEX_ASSERT(m->find() == FALSE);
2303         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2304         delete m;
2305         delete p;
2306     }
2307 
2308     //
2309     // Regions
2310     //
2311     {
2312         UErrorCode status = U_ZERO_ERROR;
2313         UText testPattern = UTEXT_INITIALIZER;
2314         UText testText    = UTEXT_INITIALIZER;
2315         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2316         REGEX_VERBOSE_TEXT(&testPattern);
2317         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2318         REGEX_VERBOSE_TEXT(&testText);
2319 
2320         RegexMatcher m(&testPattern, &testText, 0, status);
2321         REGEX_CHECK_STATUS;
2322         REGEX_ASSERT(m.regionStart() == 0);
2323         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2324         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326 
2327         m.region(2,4, status);
2328         REGEX_CHECK_STATUS;
2329         REGEX_ASSERT(m.matches(status));
2330         REGEX_ASSERT(m.start(status)==2);
2331         REGEX_ASSERT(m.end(status)==4);
2332         REGEX_CHECK_STATUS;
2333 
2334         m.reset();                                    // expected to restore the region to the whole input (checked next)
2335         REGEX_ASSERT(m.regionStart() == 0);
2336         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2337 
2338         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);   // testText is opened again over new content
2339         REGEX_VERBOSE_TEXT(&testText);
2340         m.reset(&testText);
2341         REGEX_ASSERT(m.regionStart() == 0);
2342         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2343 
2344         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2345         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));   // the setters and reset() are expected to return the matcher itself
2346         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2347         REGEX_ASSERT(&m == &m.reset());
2348         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);      // the bounds setting is expected to survive reset()
2349 
2350         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2351         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352         REGEX_ASSERT(&m == &m.reset());
2353         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2354 
2355         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2356         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2357         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2358         REGEX_ASSERT(&m == &m.reset());
2359         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2360 
2361         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2362         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2363         REGEX_ASSERT(&m == &m.reset());
2364         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2365 
2366         utext_close(&testText);
2367         utext_close(&testPattern);
2368     }
2369 
2370     //
2371     // hitEnd() and requireEnd()
2372     //
2373     {
2374         UErrorCode status = U_ZERO_ERROR;
2375         UText testPattern = UTEXT_INITIALIZER;
2376         UText testText    = UTEXT_INITIALIZER;
2377         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2378         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2379         utext_openUTF8(&testPattern, str_, -1, &status);
2380         utext_openUTF8(&testText, str_aabb, -1, &status);
2381 
2382         RegexMatcher m1(&testPattern, &testText,  0, status);
2383         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2384         REGEX_ASSERT(m1.hitEnd() == TRUE);
2385         REGEX_ASSERT(m1.requireEnd() == FALSE);
2386         REGEX_CHECK_STATUS;
2387 
2388         status = U_ZERO_ERROR;
2389         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2390         utext_openUTF8(&testPattern, str_a, -1, &status);   // testPattern is opened again over the next pattern
2391         RegexMatcher m2(&testPattern, &testText, 0, status);
2392         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2393         REGEX_ASSERT(m2.hitEnd() == FALSE);
2394         REGEX_ASSERT(m2.requireEnd() == FALSE);
2395         REGEX_CHECK_STATUS;
2396 
2397         status = U_ZERO_ERROR;
2398         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2399         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2400         RegexMatcher m3(&testPattern, &testText, 0, status);
2401         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2402         REGEX_ASSERT(m3.hitEnd() == TRUE);
2403         REGEX_ASSERT(m3.requireEnd() == TRUE);
2404         REGEX_CHECK_STATUS;
2405 
2406         utext_close(&testText);
2407         utext_close(&testPattern);
2408     }
2409 }
2410 
2411 
2412 //---------------------------------------------------------------------------
2413 //
2414 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2415 //                         Replace family of functions.
2416 //
2417 //---------------------------------------------------------------------------
API_Replace_UTF8()2418 void RegexTest::API_Replace_UTF8() {
2419     //
2420     //  Replace
2421     //
2422     int32_t             flags=0;
2423     UParseError         pe;
2424     UErrorCode          status=U_ZERO_ERROR;
2425 
2426     UText               re=UTEXT_INITIALIZER;
2427     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2428     REGEX_VERBOSE_TEXT(&re);
2429     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2430     REGEX_CHECK_STATUS;
2431 
2432     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2433     //             012345678901234567
2434     UText dataText = UTEXT_INITIALIZER;
2435     utext_openUTF8(&dataText, data, -1, &status);
2436     REGEX_CHECK_STATUS;
2437     REGEX_VERBOSE_TEXT(&dataText);
2438     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2439 
2440     //
2441     //  Plain vanilla matches.
2442     //
2443     UnicodeString  dest;
2444     UText destText = UTEXT_INITIALIZER;
2445     utext_openUnicodeString(&destText, &dest, &status);
2446     UText *result;
2447 
2448     UText replText = UTEXT_INITIALIZER;
2449 
2450     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2451     utext_openUTF8(&replText, str_yz, -1, &status);
2452     REGEX_VERBOSE_TEXT(&replText);
2453     result = matcher->replaceFirst(&replText, NULL, status);
2454     REGEX_CHECK_STATUS;
2455     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2456     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2457     utext_close(result);
2458     result = matcher->replaceFirst(&replText, &destText, status);
2459     REGEX_CHECK_STATUS;
2460     REGEX_ASSERT(result == &destText);
2461     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2462 
2463     result = matcher->replaceAll(&replText, NULL, status);
2464     REGEX_CHECK_STATUS;
2465     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2466     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2467     utext_close(result);
2468 
2469     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470     result = matcher->replaceAll(&replText, &destText, status);
2471     REGEX_CHECK_STATUS;
2472     REGEX_ASSERT(result == &destText);
2473     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2474 
2475     //
2476     //  Plain vanilla non-matches.
2477     //
2478     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2479     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2480     matcher->reset(&dataText);
2481 
2482     result = matcher->replaceFirst(&replText, NULL, status);
2483     REGEX_CHECK_STATUS;
2484     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2485     utext_close(result);
2486     result = matcher->replaceFirst(&replText, &destText, status);
2487     REGEX_CHECK_STATUS;
2488     REGEX_ASSERT(result == &destText);
2489     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2490 
2491     result = matcher->replaceAll(&replText, NULL, status);
2492     REGEX_CHECK_STATUS;
2493     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2494     utext_close(result);
2495     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496     result = matcher->replaceAll(&replText, &destText, status);
2497     REGEX_CHECK_STATUS;
2498     REGEX_ASSERT(result == &destText);
2499     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2500 
2501     //
2502     // Empty source string
2503     //
2504     utext_openUTF8(&dataText, NULL, 0, &status);
2505     matcher->reset(&dataText);
2506 
2507     result = matcher->replaceFirst(&replText, NULL, status);
2508     REGEX_CHECK_STATUS;
2509     REGEX_ASSERT_UTEXT_UTF8("", result);
2510     utext_close(result);
2511     result = matcher->replaceFirst(&replText, &destText, status);
2512     REGEX_CHECK_STATUS;
2513     REGEX_ASSERT(result == &destText);
2514     REGEX_ASSERT_UTEXT_UTF8("", result);
2515 
2516     result = matcher->replaceAll(&replText, NULL, status);
2517     REGEX_CHECK_STATUS;
2518     REGEX_ASSERT_UTEXT_UTF8("", result);
2519     utext_close(result);
2520     result = matcher->replaceAll(&replText, &destText, status);
2521     REGEX_CHECK_STATUS;
2522     REGEX_ASSERT(result == &destText);
2523     REGEX_ASSERT_UTEXT_UTF8("", result);
2524 
2525     //
2526     // Empty substitution string
2527     //
2528     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2529     matcher->reset(&dataText);
2530 
2531     utext_openUTF8(&replText, NULL, 0, &status);
2532     result = matcher->replaceFirst(&replText, NULL, status);
2533     REGEX_CHECK_STATUS;
2534     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2535     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2536     utext_close(result);
2537     result = matcher->replaceFirst(&replText, &destText, status);
2538     REGEX_CHECK_STATUS;
2539     REGEX_ASSERT(result == &destText);
2540     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2541 
2542     result = matcher->replaceAll(&replText, NULL, status);
2543     REGEX_CHECK_STATUS;
2544     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2545     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2546     utext_close(result);
2547     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2548     result = matcher->replaceAll(&replText, &destText, status);
2549     REGEX_CHECK_STATUS;
2550     REGEX_ASSERT(result == &destText);
2551     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2552 
2553     //
2554     // match whole string
2555     //
2556     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2557     utext_openUTF8(&dataText, str_abc, -1, &status);
2558     matcher->reset(&dataText);
2559 
2560     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2561     utext_openUTF8(&replText, str_xyz, -1, &status);
2562     result = matcher->replaceFirst(&replText, NULL, status);
2563     REGEX_CHECK_STATUS;
2564     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2565     utext_close(result);
2566     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2567     result = matcher->replaceFirst(&replText, &destText, status);
2568     REGEX_CHECK_STATUS;
2569     REGEX_ASSERT(result == &destText);
2570     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2571 
2572     result = matcher->replaceAll(&replText, NULL, status);
2573     REGEX_CHECK_STATUS;
2574     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2575     utext_close(result);
2576     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577     result = matcher->replaceAll(&replText, &destText, status);
2578     REGEX_CHECK_STATUS;
2579     REGEX_ASSERT(result == &destText);
2580     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2581 
2582     //
2583     // Capture Group, simple case
2584     //
2585     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2586     utext_openUTF8(&re, str_add, -1, &status);
2587     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2588     REGEX_CHECK_STATUS;
2589 
2590     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2591     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2592     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2593     REGEX_CHECK_STATUS;
2594 
2595     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2596     utext_openUTF8(&replText, str_11, -1, &status);
2597     result = matcher2->replaceFirst(&replText, NULL, status);
2598     REGEX_CHECK_STATUS;
2599     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2600     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2601     utext_close(result);
2602     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603     result = matcher2->replaceFirst(&replText, &destText, status);
2604     REGEX_CHECK_STATUS;
2605     REGEX_ASSERT(result == &destText);
2606     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2607 
2608     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2609     utext_openUTF8(&replText, str_v, -1, &status);
2610     REGEX_VERBOSE_TEXT(&replText);
2611     result = matcher2->replaceFirst(&replText, NULL, status);
2612     REGEX_CHECK_STATUS;
2613     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2614     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2615     utext_close(result);
2616     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2617     result = matcher2->replaceFirst(&replText, &destText, status);
2618     REGEX_CHECK_STATUS;
2619     REGEX_ASSERT(result == &destText);
2620     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2621 
2622     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2623                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2624                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2625     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2626     result = matcher2->replaceFirst(&replText, NULL, status);
2627     REGEX_CHECK_STATUS;
2628     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2629     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2630     utext_close(result);
2631     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2632     result = matcher2->replaceFirst(&replText, &destText, status);
2633     REGEX_CHECK_STATUS;
2634     REGEX_ASSERT(result == &destText);
2635     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2636 
2637     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2638     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2639     //                                 012345678901234567890123456
2640     supplDigitChars[22] = 0xF0;
2641     supplDigitChars[23] = 0x9D;
2642     supplDigitChars[24] = 0x9F;
2643     supplDigitChars[25] = 0x8F;
2644     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2645 
2646     result = matcher2->replaceFirst(&replText, NULL, status);
2647     REGEX_CHECK_STATUS;
2648     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2649     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2650     utext_close(result);
2651     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2652     result = matcher2->replaceFirst(&replText, &destText, status);
2653     REGEX_CHECK_STATUS;
2654     REGEX_ASSERT(result == &destText);
2655     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2656     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2657     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2658     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2659 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2660     utext_close(result);
2661     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2662     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2663     REGEX_ASSERT(result == &destText);
2664 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2665 
2666     //
2667     // Replacement String with \u hex escapes
2668     //
2669     {
2670       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2671       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2672         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2673         utext_openUTF8(&replText, str_u0043, -1, &status);
2674         matcher->reset(&dataText);
2675 
2676         result = matcher->replaceAll(&replText, NULL, status);
2677         REGEX_CHECK_STATUS;
2678         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2679         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2680         utext_close(result);
2681         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2682         result = matcher->replaceAll(&replText, &destText, status);
2683         REGEX_CHECK_STATUS;
2684         REGEX_ASSERT(result == &destText);
2685         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2686     }
2687     {
2688       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2689         utext_openUTF8(&dataText, str_abc, -1, &status);
2690         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2691         utext_openUTF8(&replText, str_U00010000, -1, &status);
2692         matcher->reset(&dataText);
2693 
2694         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2695         //                          0123456789
2696         expected[2] = 0xF0;
2697         expected[3] = 0x90;
2698         expected[4] = 0x80;
2699         expected[5] = 0x80;
2700 
2701         result = matcher->replaceAll(&replText, NULL, status);
2702         REGEX_CHECK_STATUS;
2703         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2704         utext_close(result);
2705         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2706         result = matcher->replaceAll(&replText, &destText, status);
2707         REGEX_CHECK_STATUS;
2708         REGEX_ASSERT(result == &destText);
2709         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2710     }
2711     // TODO:  need more through testing of capture substitutions.
2712 
2713     // Bug 4057
2714     //
2715     {
2716         status = U_ZERO_ERROR;
2717 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2718 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2719 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2720         utext_openUTF8(&re, str_ssee, -1, &status);
2721         utext_openUTF8(&dataText, str_blah, -1, &status);
2722         utext_openUTF8(&replText, str_ooh, -1, &status);
2723 
2724         RegexMatcher m(&re, 0, status);
2725         REGEX_CHECK_STATUS;
2726 
2727         UnicodeString result;
2728         UText resultText = UTEXT_INITIALIZER;
2729         utext_openUnicodeString(&resultText, &result, &status);
2730 
2731         // Multiple finds do NOT bump up the previous appendReplacement position.
2732         m.reset(&dataText);
2733         m.find();
2734         m.find();
2735         m.appendReplacement(&resultText, &replText, status);
2736         REGEX_CHECK_STATUS;
2737         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2738         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2739 
2740         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2741         status = U_ZERO_ERROR;
2742         result.truncate(0);
2743         utext_openUnicodeString(&resultText, &result, &status);
2744         m.reset(10, status);
2745         m.find();
2746         m.find();
2747         m.appendReplacement(&resultText, &replText, status);
2748         REGEX_CHECK_STATUS;
2749         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2750         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2751 
2752         // find() at interior of string, appendReplacement still starts at beginning.
2753         status = U_ZERO_ERROR;
2754         result.truncate(0);
2755         utext_openUnicodeString(&resultText, &result, &status);
2756         m.reset();
2757         m.find(10, status);
2758         m.find();
2759         m.appendReplacement(&resultText, &replText, status);
2760         REGEX_CHECK_STATUS;
2761         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2762         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2763 
2764         m.appendTail(&resultText, status);
2765         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2766         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2767 
2768         utext_close(&resultText);
2769     }
2770 
2771     delete matcher2;
2772     delete pat2;
2773     delete matcher;
2774     delete pat;
2775 
2776     utext_close(&dataText);
2777     utext_close(&replText);
2778     utext_close(&destText);
2779     utext_close(&re);
2780 }
2781 
2782 
2783 //---------------------------------------------------------------------------
2784 //
2785 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2786 //                        present and nominally working.
2787 //
2788 //---------------------------------------------------------------------------
API_Pattern_UTF8()2789 void RegexTest::API_Pattern_UTF8() {
2790     RegexPattern        pata;    // Test default constructor to not crash.
2791     RegexPattern        patb;
2792 
2793     REGEX_ASSERT(pata == patb);
2794     REGEX_ASSERT(pata == pata);
2795 
2796     UText         re1 = UTEXT_INITIALIZER;
2797     UText         re2 = UTEXT_INITIALIZER;
2798     UErrorCode    status = U_ZERO_ERROR;
2799     UParseError   pe;
2800 
2801     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2802     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2803     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2804     utext_openUTF8(&re2, str_def, -1, &status);
2805 
2806     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2807     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2808     REGEX_CHECK_STATUS;
2809     REGEX_ASSERT(*pat1 == *pat1);
2810     REGEX_ASSERT(*pat1 != pata);
2811 
2812     // Assign
2813     patb = *pat1;
2814     REGEX_ASSERT(patb == *pat1);
2815 
2816     // Copy Construct
2817     RegexPattern patc(*pat1);
2818     REGEX_ASSERT(patc == *pat1);
2819     REGEX_ASSERT(patb == patc);
2820     REGEX_ASSERT(pat1 != pat2);
2821     patb = *pat2;
2822     REGEX_ASSERT(patb != patc);
2823     REGEX_ASSERT(patb == *pat2);
2824 
2825     // Compile with no flags.
2826     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2827     REGEX_ASSERT(*pat1a == *pat1);
2828 
2829     REGEX_ASSERT(pat1a->flags() == 0);
2830 
2831     // Compile with different flags should be not equal
2832     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2833     REGEX_CHECK_STATUS;
2834 
2835     REGEX_ASSERT(*pat1b != *pat1a);
2836     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2837     REGEX_ASSERT(pat1a->flags() == 0);
2838     delete pat1b;
2839 
2840     // clone
2841     RegexPattern *pat1c = pat1->clone();
2842     REGEX_ASSERT(*pat1c == *pat1);
2843     REGEX_ASSERT(*pat1c != *pat2);
2844 
2845     delete pat1c;
2846     delete pat1a;
2847     delete pat1;
2848     delete pat2;
2849 
2850     utext_close(&re1);
2851     utext_close(&re2);
2852 
2853 
2854     //
2855     //   Verify that a matcher created from a cloned pattern works.
2856     //     (Jitterbug 3423)
2857     //
2858     {
2859         UErrorCode     status     = U_ZERO_ERROR;
2860         UText          pattern    = UTEXT_INITIALIZER;
2861         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2862         utext_openUTF8(&pattern, str_pL, -1, &status);
2863 
2864         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2865         RegexPattern  *pClone     = pSource->clone();
2866         delete         pSource;
2867         RegexMatcher  *mFromClone = pClone->matcher(status);
2868         REGEX_CHECK_STATUS;
2869 
2870         UText          input      = UTEXT_INITIALIZER;
2871         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2872         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2873         mFromClone->reset(&input);
2874         REGEX_ASSERT(mFromClone->find() == TRUE);
2875         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2876         REGEX_ASSERT(mFromClone->find() == TRUE);
2877         REGEX_ASSERT(mFromClone->group(status) == "World");
2878         REGEX_ASSERT(mFromClone->find() == FALSE);
2879         delete mFromClone;
2880         delete pClone;
2881 
2882         utext_close(&input);
2883         utext_close(&pattern);
2884     }
2885 
2886     //
2887     //   matches convenience API
2888     //
2889     {
2890         UErrorCode status  = U_ZERO_ERROR;
2891         UText      pattern = UTEXT_INITIALIZER;
2892         UText      input   = UTEXT_INITIALIZER;
2893 
2894         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2895         utext_openUTF8(&input, str_randominput, -1, &status);
2896 
2897         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2898         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2899         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2900         REGEX_CHECK_STATUS;
2901 
2902         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2903         utext_openUTF8(&pattern, str_abc, -1, &status);
2904         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2905         REGEX_CHECK_STATUS;
2906 
2907         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2908         utext_openUTF8(&pattern, str_nput, -1, &status);
2909         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2910         REGEX_CHECK_STATUS;
2911 
2912         utext_openUTF8(&pattern, str_randominput, -1, &status);
2913         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2914         REGEX_CHECK_STATUS;
2915 
2916         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2917         utext_openUTF8(&pattern, str_u, -1, &status);
2918         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2919         REGEX_CHECK_STATUS;
2920 
2921         utext_openUTF8(&input, str_abc, -1, &status);
2922         utext_openUTF8(&pattern, str_abc, -1, &status);
2923         status = U_INDEX_OUTOFBOUNDS_ERROR;
2924         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2925         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2926 
2927         utext_close(&input);
2928         utext_close(&pattern);
2929     }
2930 
2931 
2932     //
2933     // Split()
2934     //
2935     status = U_ZERO_ERROR;
2936     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2937     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2938     pat1 = RegexPattern::compile(&re1, pe, status);
2939     REGEX_CHECK_STATUS;
2940     UnicodeString  fields[10];
2941 
2942     int32_t n;
2943     n = pat1->split("Now is the time", fields, 10, status);
2944     REGEX_CHECK_STATUS;
2945     REGEX_ASSERT(n==4);
2946     REGEX_ASSERT(fields[0]=="Now");
2947     REGEX_ASSERT(fields[1]=="is");
2948     REGEX_ASSERT(fields[2]=="the");
2949     REGEX_ASSERT(fields[3]=="time");
2950     REGEX_ASSERT(fields[4]=="");
2951 
2952     n = pat1->split("Now is the time", fields, 2, status);
2953     REGEX_CHECK_STATUS;
2954     REGEX_ASSERT(n==2);
2955     REGEX_ASSERT(fields[0]=="Now");
2956     REGEX_ASSERT(fields[1]=="is the time");
2957     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2958 
2959     fields[1] = "*";
2960     status = U_ZERO_ERROR;
2961     n = pat1->split("Now is the time", fields, 1, status);
2962     REGEX_CHECK_STATUS;
2963     REGEX_ASSERT(n==1);
2964     REGEX_ASSERT(fields[0]=="Now is the time");
2965     REGEX_ASSERT(fields[1]=="*");
2966     status = U_ZERO_ERROR;
2967 
2968     n = pat1->split("    Now       is the time   ", fields, 10, status);
2969     REGEX_CHECK_STATUS;
2970     REGEX_ASSERT(n==6);
2971     REGEX_ASSERT(fields[0]=="");
2972     REGEX_ASSERT(fields[1]=="Now");
2973     REGEX_ASSERT(fields[2]=="is");
2974     REGEX_ASSERT(fields[3]=="the");
2975     REGEX_ASSERT(fields[4]=="time");
2976     REGEX_ASSERT(fields[5]=="");
2977     REGEX_ASSERT(fields[6]=="");
2978 
2979     fields[2] = "*";
2980     n = pat1->split("     ", fields, 10, status);
2981     REGEX_CHECK_STATUS;
2982     REGEX_ASSERT(n==2);
2983     REGEX_ASSERT(fields[0]=="");
2984     REGEX_ASSERT(fields[1]=="");
2985     REGEX_ASSERT(fields[2]=="*");
2986 
2987     fields[0] = "foo";
2988     n = pat1->split("", fields, 10, status);
2989     REGEX_CHECK_STATUS;
2990     REGEX_ASSERT(n==0);
2991     REGEX_ASSERT(fields[0]=="foo");
2992 
2993     delete pat1;
2994 
2995     //  split, with a pattern with (capture)
2996     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2997     pat1 = RegexPattern::compile(&re1,  pe, status);
2998     REGEX_CHECK_STATUS;
2999 
3000     status = U_ZERO_ERROR;
3001     fields[6] = fields[7] = "*";
3002     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3003     REGEX_CHECK_STATUS;
3004     REGEX_ASSERT(n==7);
3005     REGEX_ASSERT(fields[0]=="");
3006     REGEX_ASSERT(fields[1]=="a");
3007     REGEX_ASSERT(fields[2]=="Now is ");
3008     REGEX_ASSERT(fields[3]=="b");
3009     REGEX_ASSERT(fields[4]=="the time");
3010     REGEX_ASSERT(fields[5]=="c");
3011     REGEX_ASSERT(fields[6]=="");
3012     REGEX_ASSERT(fields[7]=="*");
3013     REGEX_ASSERT(status==U_ZERO_ERROR);
3014 
3015     fields[6] = fields[7] = "*";
3016     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3017     REGEX_CHECK_STATUS;
3018     REGEX_ASSERT(n==7);
3019     REGEX_ASSERT(fields[0]=="  ");
3020     REGEX_ASSERT(fields[1]=="a");
3021     REGEX_ASSERT(fields[2]=="Now is ");
3022     REGEX_ASSERT(fields[3]=="b");
3023     REGEX_ASSERT(fields[4]=="the time");
3024     REGEX_ASSERT(fields[5]=="c");
3025     REGEX_ASSERT(fields[6]=="");
3026     REGEX_ASSERT(fields[7]=="*");
3027 
3028     status = U_ZERO_ERROR;
3029     fields[6] = "foo";
3030     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3031     REGEX_CHECK_STATUS;
3032     REGEX_ASSERT(n==6);
3033     REGEX_ASSERT(fields[0]=="  ");
3034     REGEX_ASSERT(fields[1]=="a");
3035     REGEX_ASSERT(fields[2]=="Now is ");
3036     REGEX_ASSERT(fields[3]=="b");
3037     REGEX_ASSERT(fields[4]=="the time");
3038     REGEX_ASSERT(fields[5]==" ");
3039     REGEX_ASSERT(fields[6]=="foo");
3040 
3041     status = U_ZERO_ERROR;
3042     fields[5] = "foo";
3043     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3044     REGEX_CHECK_STATUS;
3045     REGEX_ASSERT(n==5);
3046     REGEX_ASSERT(fields[0]=="  ");
3047     REGEX_ASSERT(fields[1]=="a");
3048     REGEX_ASSERT(fields[2]=="Now is ");
3049     REGEX_ASSERT(fields[3]=="b");
3050     REGEX_ASSERT(fields[4]=="the time<c>");
3051     REGEX_ASSERT(fields[5]=="foo");
3052 
3053     status = U_ZERO_ERROR;
3054     fields[5] = "foo";
3055     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3056     REGEX_CHECK_STATUS;
3057     REGEX_ASSERT(n==5);
3058     REGEX_ASSERT(fields[0]=="  ");
3059     REGEX_ASSERT(fields[1]=="a");
3060     REGEX_ASSERT(fields[2]=="Now is ");
3061     REGEX_ASSERT(fields[3]=="b");
3062     REGEX_ASSERT(fields[4]=="the time");
3063     REGEX_ASSERT(fields[5]=="foo");
3064 
3065     status = U_ZERO_ERROR;
3066     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3067     REGEX_CHECK_STATUS;
3068     REGEX_ASSERT(n==4);
3069     REGEX_ASSERT(fields[0]=="  ");
3070     REGEX_ASSERT(fields[1]=="a");
3071     REGEX_ASSERT(fields[2]=="Now is ");
3072     REGEX_ASSERT(fields[3]=="the time<c>");
3073     status = U_ZERO_ERROR;
3074     delete pat1;
3075 
3076     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3077     pat1 = RegexPattern::compile(&re1, pe, status);
3078     REGEX_CHECK_STATUS;
3079     n = pat1->split("1-10,20", fields, 10, status);
3080     REGEX_CHECK_STATUS;
3081     REGEX_ASSERT(n==5);
3082     REGEX_ASSERT(fields[0]=="1");
3083     REGEX_ASSERT(fields[1]=="-");
3084     REGEX_ASSERT(fields[2]=="10");
3085     REGEX_ASSERT(fields[3]==",");
3086     REGEX_ASSERT(fields[4]=="20");
3087     delete pat1;
3088 
3089 
3090     //
3091     // split of a UText based string, with library allocating output UTexts.
3092     //
3093     {
3094         status = U_ZERO_ERROR;
3095         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3096         UnicodeString stringToSplit("first:second:third");
3097         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3098         REGEX_CHECK_STATUS;
3099 
3100         UText *splits[10] = {NULL};
3101         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3102         REGEX_CHECK_STATUS;
3103         REGEX_ASSERT(numFields == 5);
3104         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3105         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3106         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3107         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3108         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3109         REGEX_ASSERT(splits[5] == NULL);
3110 
3111         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3112             if (splits[i]) {
3113                 utext_close(splits[i]);
3114                 splits[i] = NULL;
3115             }
3116         }
3117         utext_close(textToSplit);
3118     }
3119 
3120 
3121     //
3122     // RegexPattern::pattern() and patternText()
3123     //
3124     pat1 = new RegexPattern();
3125     REGEX_ASSERT(pat1->pattern() == "");
3126     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3127     delete pat1;
3128     const char *helloWorldInvariant = "(Hello, world)*";
3129     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3130     pat1 = RegexPattern::compile(&re1, pe, status);
3131     REGEX_CHECK_STATUS;
3132     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3133     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3134     delete pat1;
3135 
3136     utext_close(&re1);
3137 }
3138 
3139 
3140 //---------------------------------------------------------------------------
3141 //
3142 //      Extended       A more thorough check for features of regex patterns
3143 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3145 //                     A description of the test data format is included in that file.
3146 //
3147 //---------------------------------------------------------------------------
3148 
3149 const char *
getPath(char buffer[2048],const char * filename)3150 RegexTest::getPath(char buffer[2048], const char *filename) {
3151     UErrorCode status=U_ZERO_ERROR;
3152     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3153     if (U_FAILURE(status)) {
3154         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3155         return NULL;
3156     }
3157 
3158     strcpy(buffer, testDataDirectory);
3159     strcat(buffer, filename);
3160     return buffer;
3161 }
3162 
Extended()3163 void RegexTest::Extended() {
3164     char tdd[2048];
3165     const char *srcPath;
3166     UErrorCode  status  = U_ZERO_ERROR;
3167     int32_t     lineNum = 0;
3168 
3169     //
3170     //  Open and read the test data file.
3171     //
3172     srcPath=getPath(tdd, "regextst.txt");
3173     if(srcPath==NULL) {
3174         return; /* something went wrong, error already output */
3175     }
3176 
3177     int32_t    len;
3178     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3179     if (U_FAILURE(status)) {
3180         return; /* something went wrong, error already output */
3181     }
3182 
3183     //
3184     //  Put the test data into a UnicodeString
3185     //
3186     UnicodeString testString(FALSE, testData, len);
3187 
3188     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3189     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3190     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3191 
3192     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3193     UnicodeString   testPattern;   // The pattern for test from the test file.
3194     UnicodeString   testFlags;     // the flags   for a test.
3195     UnicodeString   matchString;   // The marked up string to be used as input
3196 
3197     if (U_FAILURE(status)){
3198         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3199         delete [] testData;
3200         return;
3201     }
3202 
3203     //
3204     //  Loop over the test data file, once per line.
3205     //
3206     while (lineMat.find()) {
3207         lineNum++;
3208         if (U_FAILURE(status)) {
3209           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3210         }
3211 
3212         status = U_ZERO_ERROR;
3213         UnicodeString testLine = lineMat.group(1, status);
3214         if (testLine.length() == 0) {
3215             continue;
3216         }
3217 
3218         //
3219         // Parse the test line.  Skip blank and comment only lines.
3220         // Separate out the three main fields - pattern, flags, target.
3221         //
3222 
3223         commentMat.reset(testLine);
3224         if (commentMat.lookingAt(status)) {
3225             // This line is a comment, or blank.
3226             continue;
3227         }
3228 
3229         //
3230         //  Pull out the pattern field, remove it from the test file line.
3231         //
3232         quotedStuffMat.reset(testLine);
3233         if (quotedStuffMat.lookingAt(status)) {
3234             testPattern = quotedStuffMat.group(2, status);
3235             testLine.remove(0, quotedStuffMat.end(0, status));
3236         } else {
3237             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3238             continue;
3239         }
3240 
3241 
3242         //
3243         //  Pull out the flags from the test file line.
3244         //
3245         flagsMat.reset(testLine);
3246         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3247         testFlags = flagsMat.group(1, status);
3248         if (flagsMat.group(2, status).length() > 0) {
3249             errln("Bad Match flag at line %d. Scanning %c\n",
3250                 lineNum, flagsMat.group(2, status).charAt(0));
3251             continue;
3252         }
3253         testLine.remove(0, flagsMat.end(0, status));
3254 
3255         //
3256         //  Pull out the match string, as a whole.
3257         //    We'll process the <tags> later.
3258         //
3259         quotedStuffMat.reset(testLine);
3260         if (quotedStuffMat.lookingAt(status)) {
3261             matchString = quotedStuffMat.group(2, status);
3262             testLine.remove(0, quotedStuffMat.end(0, status));
3263         } else {
3264             errln("Bad match string at test file line %d", lineNum);
3265             continue;
3266         }
3267 
3268         //
3269         //  The only thing left from the input line should be an optional trailing comment.
3270         //
3271         commentMat.reset(testLine);
3272         if (commentMat.lookingAt(status) == FALSE) {
3273             errln("Line %d: unexpected characters at end of test line.", lineNum);
3274             continue;
3275         }
3276 
3277         //
3278         //  Run the test
3279         //
3280         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3281     }
3282 
3283     delete [] testData;
3284 
3285 }
3286 
3287 
3288 
3289 //---------------------------------------------------------------------------
3290 //
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
3292 //
3293 //         Function to run a single test from the Extended (data driven) tests.
3294 //         See file test/testdata/regextst.txt for a description of the
3295 //         pattern and inputString fields, and the allowed flags.
3296 //         lineNumber is the source line in regextst.txt of the test.
3297 //
3298 //---------------------------------------------------------------------------
3299 
3300 
3301 //  Set a value into a UVector at position specified by a decimal number in
3302 //   a UnicodeString.   This is a utility function needed by the actual test function,
3303 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3304 static void set(UVector &vec, int32_t val, UnicodeString index) {
3305     UErrorCode  status=U_ZERO_ERROR;
3306     int32_t  idx = 0;
3307     for (int32_t i=0; i<index.length(); i++) {
3308         int32_t d=u_charDigitValue(index.charAt(i));
3309         if (d<0) {return;}
3310         idx = idx*10 + d;
3311     }
3312     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3313     vec.setElementAt(val, idx);
3314 }
3315 
setInt(UVector & vec,int32_t val,int32_t idx)3316 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3317     UErrorCode  status=U_ZERO_ERROR;
3318     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3319     vec.setElementAt(val, idx);
3320 }
3321 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3322 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3323 {
3324     UBool couldFind = TRUE;
3325     UTEXT_SETNATIVEINDEX(utext, 0);
3326     int32_t i = 0;
3327     while (i < unistrOffset) {
3328         UChar32 c = UTEXT_NEXT32(utext);
3329         if (c != U_SENTINEL) {
3330             i += U16_LENGTH(c);
3331         } else {
3332             couldFind = FALSE;
3333             break;
3334         }
3335     }
3336     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3337     return couldFind;
3338 }
3339 
3340 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3341 void RegexTest::regex_find(const UnicodeString &pattern,
3342                            const UnicodeString &flags,
3343                            const UnicodeString &inputString,
3344                            const char *srcPath,
3345                            int32_t line) {
3346     UnicodeString       unEscapedInput;
3347     UnicodeString       deTaggedInput;
3348 
3349     int32_t             patternUTF8Length,      inputUTF8Length;
3350     char                *patternChars  = NULL, *inputChars = NULL;
3351     UText               patternText    = UTEXT_INITIALIZER;
3352     UText               inputText      = UTEXT_INITIALIZER;
3353     UConverter          *UTF8Converter = NULL;
3354 
3355     UErrorCode          status         = U_ZERO_ERROR;
3356     UParseError         pe;
3357     RegexPattern        *parsePat      = NULL;
3358     RegexMatcher        *parseMatcher  = NULL;
3359     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3360     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3361     UVector             groupStarts(status);
3362     UVector             groupEnds(status);
3363     UVector             groupStartsUTF8(status);
3364     UVector             groupEndsUTF8(status);
3365     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3366     UBool               failed         = FALSE;
3367     int32_t             numFinds;
3368     int32_t             i;
3369     UBool               useMatchesFunc   = FALSE;
3370     UBool               useLookingAtFunc = FALSE;
3371     int32_t             regionStart      = -1;
3372     int32_t             regionEnd        = -1;
3373     int32_t             regionStartUTF8  = -1;
3374     int32_t             regionEndUTF8    = -1;
3375 
3376 
3377     //
3378     //  Compile the caller's pattern
3379     //
3380     uint32_t bflags = 0;
3381     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3382         bflags |= UREGEX_CASE_INSENSITIVE;
3383     }
3384     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3385         bflags |= UREGEX_COMMENTS;
3386     }
3387     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3388         bflags |= UREGEX_DOTALL;
3389     }
3390     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3391         bflags |= UREGEX_MULTILINE;
3392     }
3393 
3394     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3395         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3396     }
3397     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3398         bflags |= UREGEX_UNIX_LINES;
3399     }
3400     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3401         bflags |= UREGEX_LITERAL;
3402     }
3403 
3404 
3405     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3406     if (status != U_ZERO_ERROR) {
3407         #if UCONFIG_NO_BREAK_ITERATION==1
3408         // 'v' test flag means that the test pattern should not compile if ICU was configured
3409         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3410         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3411             goto cleanupAndReturn;
3412         }
3413         #endif
3414         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3415             // Expected pattern compilation error.
3416             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3417                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3418             }
3419             goto cleanupAndReturn;
3420         } else {
3421             // Unexpected pattern compilation error.
3422             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3423             goto cleanupAndReturn;
3424         }
3425     }
3426 
3427     UTF8Converter = ucnv_open("UTF8", &status);
3428     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3429 
3430     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3431     status = U_ZERO_ERROR; // buffer overflow
3432     patternChars = new char[patternUTF8Length+1];
3433     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3434     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3435 
3436     if (status == U_ZERO_ERROR) {
3437         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3438 
3439         if (status != U_ZERO_ERROR) {
3440 #if UCONFIG_NO_BREAK_ITERATION==1
3441             // 'v' test flag means that the test pattern should not compile if ICU was configured
3442             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3443             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3444                 goto cleanupAndReturn;
3445             }
3446 #endif
3447             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3448                 // Expected pattern compilation error.
3449                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3450                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3451                 }
3452                 goto cleanupAndReturn;
3453             } else {
3454                 // Unexpected pattern compilation error.
3455                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3456                 goto cleanupAndReturn;
3457             }
3458         }
3459     }
3460 
3461     if (UTF8Pattern == NULL) {
3462         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3463         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3464         status = U_ZERO_ERROR;
3465     }
3466 
3467     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3468         callerPattern->dumpPattern();
3469     }
3470 
3471     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3472         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3473         goto cleanupAndReturn;
3474     }
3475 
3476 
3477     //
3478     // Number of times find() should be called on the test string, default to 1
3479     //
3480     numFinds = 1;
3481     for (i=2; i<=9; i++) {
3482         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3483             if (numFinds != 1) {
3484                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3485                 goto cleanupAndReturn;
3486             }
3487             numFinds = i;
3488         }
3489     }
3490 
3491     // 'M' flag.  Use matches() instead of find()
3492     if (flags.indexOf((UChar)0x4d) >= 0) {
3493         useMatchesFunc = TRUE;
3494     }
3495     if (flags.indexOf((UChar)0x4c) >= 0) {
3496         useLookingAtFunc = TRUE;
3497     }
3498 
3499     //
3500     //  Find the tags in the input data, remove them, and record the group boundary
3501     //    positions.
3502     //
3503     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3504     if (!assertSuccess(WHERE, status) ) {
3505         goto cleanupAndReturn;
3506     }
3507 
3508     unEscapedInput = inputString.unescape();
3509     parseMatcher = parsePat->matcher(unEscapedInput, status);
3510     if (!assertSuccess(WHERE, status) ) {
3511         goto cleanupAndReturn;
3512     }
3513     while(parseMatcher->find()) {
3514         parseMatcher->appendReplacement(deTaggedInput, "", status);
3515         REGEX_CHECK_STATUS;
3516         UnicodeString groupNum = parseMatcher->group(2, status);
3517         if (groupNum == "r") {
3518             // <r> or </r>, a region specification within the string
3519             if (parseMatcher->group(1, status) == "/") {
3520                 regionEnd = deTaggedInput.length();
3521             } else {
3522                 regionStart = deTaggedInput.length();
3523             }
3524         } else {
3525             // <digits> or </digits>, a group match boundary tag.
3526             if (parseMatcher->group(1, status) == "/") {
3527                 set(groupEnds, deTaggedInput.length(), groupNum);
3528             } else {
3529                 set(groupStarts, deTaggedInput.length(), groupNum);
3530             }
3531         }
3532     }
3533     parseMatcher->appendTail(deTaggedInput);
3534 
3535     if (groupStarts.size() != groupEnds.size()) {
3536         errln("Error at line %d: mismatched <n> group tags in expected results.", line);
3537         failed = true;
3538         goto cleanupAndReturn;
3539     }
3540     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3541         errln("mismatched <r> tags");
3542         failed = TRUE;
3543         goto cleanupAndReturn;
3544     }
3545 
3546     //
3547     //  Configure the matcher according to the flags specified with this test.
3548     //
3549     matcher = callerPattern->matcher(deTaggedInput, status);
3550     REGEX_CHECK_STATUS_L(line);
3551     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3552         matcher->setTrace(TRUE);
3553     }
3554 
3555     if (UTF8Pattern != NULL) {
3556         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3557         status = U_ZERO_ERROR; // buffer overflow
3558         inputChars = new char[inputUTF8Length+1];
3559         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3560         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3561 
3562         if (status == U_ZERO_ERROR) {
3563             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3564             REGEX_CHECK_STATUS_L(line);
3565         }
3566 
3567         if (UTF8Matcher == NULL) {
3568             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3569             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3570             status = U_ZERO_ERROR;
3571         }
3572     }
3573 
3574     //
3575     //  Generate native indices for UTF8 versions of region and capture group info
3576     //
3577     if (UTF8Matcher != NULL) {
3578         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3579             UTF8Matcher->setTrace(TRUE);
3580         }
3581         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3582         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3583 
3584         //  Fill out the native index UVector info.
3585         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3586         for (i=0; i<groupStarts.size(); i++) {
3587             int32_t  start = groupStarts.elementAti(i);
3588             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3589             if (start >= 0) {
3590                 int32_t  startUTF8;
3591                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3592                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3593                     failed = TRUE;
3594                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3595                 }
3596                 setInt(groupStartsUTF8, startUTF8, i);
3597             }
3598 
3599             int32_t  end = groupEnds.elementAti(i);
3600             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3601             if (end >= 0) {
3602                 int32_t  endUTF8;
3603                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3604                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3605                     failed = TRUE;
3606                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3607                 }
3608                 setInt(groupEndsUTF8, endUTF8, i);
3609             }
3610         }
3611     }
3612 
3613     if (regionStart>=0) {
3614        matcher->region(regionStart, regionEnd, status);
3615        REGEX_CHECK_STATUS_L(line);
3616        if (UTF8Matcher != NULL) {
3617            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3618            REGEX_CHECK_STATUS_L(line);
3619        }
3620     }
3621     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3622         matcher->useAnchoringBounds(FALSE);
3623         if (UTF8Matcher != NULL) {
3624             UTF8Matcher->useAnchoringBounds(FALSE);
3625         }
3626     }
3627     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3628         matcher->useTransparentBounds(TRUE);
3629         if (UTF8Matcher != NULL) {
3630             UTF8Matcher->useTransparentBounds(TRUE);
3631         }
3632     }
3633 
3634 
3635 
3636     //
3637     // Do a find on the de-tagged input using the caller's pattern
3638     //     TODO: error on count>1 and not find().
3639     //           error on both matches() and lookingAt().
3640     //
3641     for (i=0; i<numFinds; i++) {
3642         if (useMatchesFunc) {
3643             isMatch = matcher->matches(status);
3644             if (UTF8Matcher != NULL) {
3645                isUTF8Match = UTF8Matcher->matches(status);
3646             }
3647         } else  if (useLookingAtFunc) {
3648             isMatch = matcher->lookingAt(status);
3649             if (UTF8Matcher != NULL) {
3650                 isUTF8Match = UTF8Matcher->lookingAt(status);
3651             }
3652         } else {
3653             isMatch = matcher->find();
3654             if (UTF8Matcher != NULL) {
3655                 isUTF8Match = UTF8Matcher->find();
3656             }
3657         }
3658     }
3659     matcher->setTrace(FALSE);
3660     if (UTF8Matcher) {
3661         UTF8Matcher->setTrace(FALSE);
3662     }
3663     if (U_FAILURE(status)) {
3664         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3665     }
3666 
3667     //
3668     // Match up the groups from the find() with the groups from the tags
3669     //
3670 
3671     // number of tags should match number of groups from find operation.
3672     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3673     //   G option in test means that capture group data is not available in the
3674     //     expected results, so the check needs to be suppressed.
3675     if (isMatch == FALSE && groupStarts.size() != 0) {
3676         dataerrln("Error at line %d:  Match expected, but none found.", line);
3677         failed = TRUE;
3678         goto cleanupAndReturn;
3679     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3680         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3681         failed = TRUE;
3682         goto cleanupAndReturn;
3683     }
3684     if (isMatch && groupStarts.size() == 0) {
3685         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3686         failed = TRUE;
3687     }
3688     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3689         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3690         failed = TRUE;
3691     }
3692 
3693     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694         // Only check for match / no match.  Don't check capture groups.
3695         goto cleanupAndReturn;
3696     }
3697 
3698     REGEX_CHECK_STATUS_L(line);
3699     for (i=0; i<=matcher->groupCount(); i++) {
3700         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3701         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3702         if (matcher->start(i, status) != expectedStart) {
3703             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3704                 line, i, expectedStart, matcher->start(i, status));
3705             failed = TRUE;
3706             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3707         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3708             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3709                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3710             failed = TRUE;
3711             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3712         }
3713 
3714         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3715         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3716         if (matcher->end(i, status) != expectedEnd) {
3717             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3718                 line, i, expectedEnd, matcher->end(i, status));
3719             failed = TRUE;
3720             // Error on end position;  keep going; real error is probably yet to come as group
3721             //   end positions work from end of the input data towards the front.
3722         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3723             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3724                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3725             failed = TRUE;
3726             // Error on end position;  keep going; real error is probably yet to come as group
3727             //   end positions work from end of the input data towards the front.
3728         }
3729     }
3730     if ( matcher->groupCount()+1 < groupStarts.size()) {
3731         errln("Error at line %d: Expected %d capture groups, found %d.",
3732             line, groupStarts.size()-1, matcher->groupCount());
3733         failed = TRUE;
3734         }
3735     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3736         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3737               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3738         failed = TRUE;
3739     }
3740 
3741     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3742         matcher->requireEnd() == TRUE) {
3743         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3744         failed = TRUE;
3745     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3746         UTF8Matcher->requireEnd() == TRUE) {
3747         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3748         failed = TRUE;
3749     }
3750 
3751     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3752         matcher->requireEnd() == FALSE) {
3753         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3754         failed = TRUE;
3755     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3756         UTF8Matcher->requireEnd() == FALSE) {
3757         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3758         failed = TRUE;
3759     }
3760 
3761     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3762         matcher->hitEnd() == TRUE) {
3763         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3764         failed = TRUE;
3765     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3766                UTF8Matcher->hitEnd() == TRUE) {
3767         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3768         failed = TRUE;
3769     }
3770 
3771     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3772         matcher->hitEnd() == FALSE) {
3773         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3774         failed = TRUE;
3775     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3776                UTF8Matcher->hitEnd() == FALSE) {
3777         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3778         failed = TRUE;
3779     }
3780 
3781 
3782 cleanupAndReturn:
3783     if (failed) {
3784         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3785             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3786         // callerPattern->dump();
3787     }
3788     delete parseMatcher;
3789     delete parsePat;
3790     delete UTF8Matcher;
3791     delete UTF8Pattern;
3792     delete matcher;
3793     delete callerPattern;
3794 
3795     utext_close(&inputText);
3796     delete[] inputChars;
3797     utext_close(&patternText);
3798     delete[] patternChars;
3799     ucnv_close(UTF8Converter);
3800 }
3801 
3802 
3803 
3804 
3805 //---------------------------------------------------------------------------
3806 //
3807 //      Errors     Check for error handling in patterns.
3808 //
3809 //---------------------------------------------------------------------------
Errors()3810 void RegexTest::Errors() {
3811     // \escape sequences that aren't implemented yet.
3812     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3813 
3814     // Missing close parentheses
3815     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3816     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3817     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3818 
3819     // Extra close paren
3820     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3821     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3822     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3823 
3824     // Look-ahead, Look-behind
3825     //  TODO:  add tests for unbounded length look-behinds.
3826     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3827 
3828     // Attempt to use non-default flags
3829     {
3830         UParseError   pe;
3831         UErrorCode    status = U_ZERO_ERROR;
3832         int32_t       flags  = UREGEX_CANON_EQ |
3833                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3834                                UREGEX_MULTILINE;
3835         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3836         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3837         delete pat1;
3838     }
3839 
3840 
3841     // Quantifiers are allowed only after something that can be quantified.
3842     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3843     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3844     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3845 
3846     // Mal-formed {min,max} quantifiers
3847     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3848     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3849     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3850     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3851     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3852     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3853     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3854     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3855     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3856 
3857     // Ticket 5389
3858     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3859 
3860     // Invalid Back Reference \0
3861     //    For ICU 3.8 and earlier
3862     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3863     //
3864     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3865 
3866 }
3867 
3868 //-------------------------------------------------------------------------------
3869 //
3870 //   PerlTests  - Run Perl's regular expression tests
3871 //                The input file for this test is re_tests, the standard regular
3872 //                expression test data distributed with the Perl source code.
3873 //
3874 //                Here is Perl's description of the test data file:
3875 //
3876 //        # The tests are in a separate file 't/op/re_tests'.
3877 //        # Each line in that file is a separate test.
3878 //        # There are five columns, separated by tabs.
3879 //        #
3880 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3881 //        # Modifiers can be put after the closing C<'>.
3882 //        #
3883 //        # Column 2 contains the string to be matched.
3884 //        #
3885 //        # Column 3 contains the expected result:
3886 //        #     y   expect a match
3887 //        #     n   expect no match
3888 //        #     c   expect an error
3889 //        # B   test exposes a known bug in Perl, should be skipped
3890 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3891 //        #
3892 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3893 //        #
3894 //        # Column 4 contains a string, usually C<$&>.
3895 //        #
3896 //        # Column 5 contains the expected result of double-quote
3897 //        # interpolating that string after the match, or start of error message.
3898 //        #
3899 //        # Column 6, if present, contains a reason why the test is skipped.
3900 //        # This is printed with "skipped", for harness to pick up.
3901 //        #
3902 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3903 //        #
3904 //        # If you want to add a regular expression test that can't be expressed
3905 //        # in this format, don't add it here: put it in op/pat.t instead.
3906 //
3907 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
3909 //        (The i is in addition to whatever was there before.)
3910 //
3911 //-------------------------------------------------------------------------------
PerlTests()3912 void RegexTest::PerlTests() {
3913     char tdd[2048];
3914     const char *srcPath;
3915     UErrorCode  status = U_ZERO_ERROR;
3916     UParseError pe;
3917 
3918     //
3919     //  Open and read the test data file.
3920     //
3921     srcPath=getPath(tdd, "re_tests.txt");
3922     if(srcPath==NULL) {
3923         return; /* something went wrong, error already output */
3924     }
3925 
3926     int32_t    len;
3927     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3928     if (U_FAILURE(status)) {
3929         return; /* something went wrong, error already output */
3930     }
3931 
3932     //
3933     //  Put the test data into a UnicodeString
3934     //
3935     UnicodeString testDataString(FALSE, testData, len);
3936 
3937     //
3938     //  Regex to break the input file into lines, and strip the new lines.
3939     //     One line per match, capture group one is the desired data.
3940     //
3941     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3942     if (U_FAILURE(status)) {
3943         dataerrln("RegexPattern::compile() error");
3944         return;
3945     }
3946     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3947 
3948     //
3949     //  Regex to split a test file line into fields.
3950     //    There are six fields, separated by tabs.
3951     //
3952     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3953 
3954     //
3955     //  Regex to identify test patterns with flag settings, and to separate them.
3956     //    Test patterns with flags look like 'pattern'i
3957     //    Test patterns without flags are not quoted:   pattern
3958     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3959     //
3960     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3961     RegexMatcher* flagMat = flagPat->matcher(status);
3962 
3963     //
3964     // The Perl tests reference several perl-isms, which are evaluated/substituted
3965     //   in the test data.  Not being perl, this must be done explicitly.  Here
3966     //   are string constants and REs for these constructs.
3967     //
3968     UnicodeString nulnulSrc("${nulnul}");
3969     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3970     nulnul = nulnul.unescape();
3971 
3972     UnicodeString ffffSrc("${ffff}");
3973     UnicodeString ffff("\\uffff", -1, US_INV);
3974     ffff = ffff.unescape();
3975 
3976     //  regexp for $-[0], $+[2], etc.
3977     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3978     RegexMatcher *groupsMat = groupsPat->matcher(status);
3979 
3980     //  regexp for $0, $1, $2, etc.
3981     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3982     RegexMatcher *cgMat = cgPat->matcher(status);
3983 
3984 
3985     //
3986     // Main Loop for the Perl Tests, runs once per line from the
3987     //   test data file.
3988     //
3989     int32_t  lineNum = 0;
3990     int32_t  skippedUnimplementedCount = 0;
3991     while (lineMat->find()) {
3992         lineNum++;
3993 
3994         //
3995         //  Get a line, break it into its fields, do the Perl
3996         //    variable substitutions.
3997         //
3998         UnicodeString line = lineMat->group(1, status);
3999         UnicodeString fields[7];
4000         fieldPat->split(line, fields, 7, status);
4001 
4002         flagMat->reset(fields[0]);
4003         flagMat->matches(status);
4004         UnicodeString pattern  = flagMat->group(2, status);
4005         pattern.findAndReplace("${bang}", "!");
4006         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4007         pattern.findAndReplace(ffffSrc, ffff);
4008 
4009         //
4010         //  Identify patterns that include match flag settings,
4011         //    split off the flags, remove the extra quotes.
4012         //
4013         UnicodeString flagStr = flagMat->group(3, status);
4014         if (U_FAILURE(status)) {
4015             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4016             return;
4017         }
4018         int32_t flags = 0;
4019         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4020         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4021         const UChar UChar_m = 0x6d;
4022         const UChar UChar_x = 0x78;
4023         const UChar UChar_y = 0x79;
4024         if (flagStr.indexOf(UChar_i) != -1) {
4025             flags |= UREGEX_CASE_INSENSITIVE;
4026         }
4027         if (flagStr.indexOf(UChar_m) != -1) {
4028             flags |= UREGEX_MULTILINE;
4029         }
4030         if (flagStr.indexOf(UChar_x) != -1) {
4031             flags |= UREGEX_COMMENTS;
4032         }
4033 
4034         //
4035         // Compile the test pattern.
4036         //
4037         status = U_ZERO_ERROR;
4038         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4039         if (status == U_REGEX_UNIMPLEMENTED) {
4040             //
4041             // Test of a feature that is planned for ICU, but not yet implemented.
4042             //   skip the test.
4043             skippedUnimplementedCount++;
4044             delete testPat;
4045             status = U_ZERO_ERROR;
4046             continue;
4047         }
4048 
4049         if (U_FAILURE(status)) {
4050             // Some tests are supposed to generate errors.
4051             //   Only report an error for tests that are supposed to succeed.
4052             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4053                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4054             {
4055                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4056             }
4057             status = U_ZERO_ERROR;
4058             delete testPat;
4059             continue;
4060         }
4061 
4062         if (fields[2].indexOf(UChar_i) >= 0) {
4063             // ICU should skip this test.
4064             delete testPat;
4065             continue;
4066         }
4067 
4068         if (fields[2].indexOf(UChar_c) >= 0) {
4069             // This pattern should have caused a compilation error, but didn't/
4070             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4071             delete testPat;
4072             continue;
4073         }
4074 
4075         //
4076         // replace the Perl variables that appear in some of the
4077         //   match data strings.
4078         //
4079         UnicodeString matchString = fields[1];
4080         matchString.findAndReplace(nulnulSrc, nulnul);
4081         matchString.findAndReplace(ffffSrc,   ffff);
4082 
4083         // Replace any \n in the match string with an actual new-line char.
4084         //  Don't do full unescape, as this unescapes more than Perl does, which
4085         //  causes other spurious failures in the tests.
4086         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4087 
4088 
4089 
4090         //
4091         // Run the test, check for expected match/don't match result.
4092         //
4093         RegexMatcher *testMat = testPat->matcher(matchString, status);
4094         UBool found = testMat->find();
4095         UBool expected = FALSE;
4096         if (fields[2].indexOf(UChar_y) >=0) {
4097             expected = TRUE;
4098         }
4099         if (expected != found) {
4100             errln("line %d: Expected %smatch, got %smatch",
4101                 lineNum, expected?"":"no ", found?"":"no " );
4102             delete testMat;
4103             delete testPat;
4104             continue;
4105         }
4106 
4107         // Don't try to check expected results if there is no match.
4108         //   (Some have stuff in the expected fields)
4109         if (!found) {
4110             delete testMat;
4111             delete testPat;
4112             continue;
4113         }
4114 
4115         //
4116         // Interpret the Perl expression from the fourth field of the data file,
4117         // building up an ICU string from the results of the ICU match.
4118         //   The Perl expression will contain references to the results of
4119         //     a regex match, including the matched string, capture group strings,
4120         //     group starting and ending indices, etc.
4121         //
4122         UnicodeString resultString;
4123         UnicodeString perlExpr = fields[3];
4124 #if SUPPORT_MUTATING_INPUT_STRING
4125         groupsMat->reset(perlExpr);
4126         cgMat->reset(perlExpr);
4127 #endif
4128 
4129         while (perlExpr.length() > 0) {
4130 #if !SUPPORT_MUTATING_INPUT_STRING
4131             //  Preferred usage.  Reset after any modification to input string.
4132             groupsMat->reset(perlExpr);
4133             cgMat->reset(perlExpr);
4134 #endif
4135 
4136             if (perlExpr.startsWith("$&")) {
4137                 resultString.append(testMat->group(status));
4138                 perlExpr.remove(0, 2);
4139             }
4140 
4141             else if (groupsMat->lookingAt(status)) {
4142                 // $-[0]   $+[2]  etc.
4143                 UnicodeString digitString = groupsMat->group(2, status);
4144                 int32_t t = 0;
4145                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4146                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4147                 int32_t matchPosition;
4148                 if (plusOrMinus.compare("+") == 0) {
4149                     matchPosition = testMat->end(groupNum, status);
4150                 } else {
4151                     matchPosition = testMat->start(groupNum, status);
4152                 }
4153                 if (matchPosition != -1) {
4154                     ICU_Utility::appendNumber(resultString, matchPosition);
4155                 }
4156                 perlExpr.remove(0, groupsMat->end(status));
4157             }
4158 
4159             else if (cgMat->lookingAt(status)) {
4160                 // $1, $2, $3, etc.
4161                 UnicodeString digitString = cgMat->group(1, status);
4162                 int32_t t = 0;
4163                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4164                 if (U_SUCCESS(status)) {
4165                     resultString.append(testMat->group(groupNum, status));
4166                     status = U_ZERO_ERROR;
4167                 }
4168                 perlExpr.remove(0, cgMat->end(status));
4169             }
4170 
4171             else if (perlExpr.startsWith("@-")) {
4172                 int32_t i;
4173                 for (i=0; i<=testMat->groupCount(); i++) {
4174                     if (i>0) {
4175                         resultString.append(" ");
4176                     }
4177                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4178                 }
4179                 perlExpr.remove(0, 2);
4180             }
4181 
4182             else if (perlExpr.startsWith("@+")) {
4183                 int32_t i;
4184                 for (i=0; i<=testMat->groupCount(); i++) {
4185                     if (i>0) {
4186                         resultString.append(" ");
4187                     }
4188                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4189                 }
4190                 perlExpr.remove(0, 2);
4191             }
4192 
4193             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4194                                                      //           or as an escaped sequence (e.g. \n)
4195                 if (perlExpr.length() > 1) {
4196                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4197                 }
4198                 UChar c = perlExpr.charAt(0);
4199                 switch (c) {
4200                 case 'n':   c = '\n'; break;
4201                 // add any other escape sequences that show up in the test expected results.
4202                 }
4203                 resultString.append(c);
4204                 perlExpr.remove(0, 1);
4205             }
4206 
4207             else  {
4208                 // Any characters from the perl expression that we don't explicitly
4209                 //  recognize before here are assumed to be literals and copied
4210                 //  as-is to the expected results.
4211                 resultString.append(perlExpr.charAt(0));
4212                 perlExpr.remove(0, 1);
4213             }
4214 
4215             if (U_FAILURE(status)) {
4216                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4217                 break;
4218             }
4219         }
4220 
4221         //
4222         // Expected Results Compare
4223         //
4224         UnicodeString expectedS(fields[4]);
4225         expectedS.findAndReplace(nulnulSrc, nulnul);
4226         expectedS.findAndReplace(ffffSrc,   ffff);
4227         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4228 
4229 
4230         if (expectedS.compare(resultString) != 0) {
4231             err("Line %d: Incorrect perl expression results.", lineNum);
4232             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4233         }
4234 
4235         delete testMat;
4236         delete testPat;
4237     }
4238 
4239     //
4240     // All done.  Clean up allocated stuff.
4241     //
4242     delete cgMat;
4243     delete cgPat;
4244 
4245     delete groupsMat;
4246     delete groupsPat;
4247 
4248     delete flagMat;
4249     delete flagPat;
4250 
4251     delete lineMat;
4252     delete linePat;
4253 
4254     delete fieldPat;
4255     delete [] testData;
4256 
4257 
4258     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4259 
4260 }
4261 
4262 
4263 //-------------------------------------------------------------------------------
4264 //
4265 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4266 //                  (instead of using UnicodeStrings) to test the alternate engine.
4267 //                  The input file for this test is re_tests, the standard regular
4268 //                  expression test data distributed with the Perl source code.
4269 //                  See PerlTests() for more information.
4270 //
4271 //-------------------------------------------------------------------------------
PerlTestsUTF8()4272 void RegexTest::PerlTestsUTF8() {
4273     char tdd[2048];
4274     const char *srcPath;
4275     UErrorCode  status = U_ZERO_ERROR;
4276     UParseError pe;
4277     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4278     UText       patternText = UTEXT_INITIALIZER;
4279     char       *patternChars = NULL;
4280     int32_t     patternLength;
4281     int32_t     patternCapacity = 0;
4282     UText       inputText = UTEXT_INITIALIZER;
4283     char       *inputChars = NULL;
4284     int32_t     inputLength;
4285     int32_t     inputCapacity = 0;
4286 
4287     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4288 
4289     //
4290     //  Open and read the test data file.
4291     //
4292     srcPath=getPath(tdd, "re_tests.txt");
4293     if(srcPath==NULL) {
4294         return; /* something went wrong, error already output */
4295     }
4296 
4297     int32_t    len;
4298     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4299     if (U_FAILURE(status)) {
4300         return; /* something went wrong, error already output */
4301     }
4302 
4303     //
4304     //  Put the test data into a UnicodeString
4305     //
4306     UnicodeString testDataString(FALSE, testData, len);
4307 
4308     //
4309     //  Regex to break the input file into lines, and strip the new lines.
4310     //     One line per match, capture group one is the desired data.
4311     //
4312     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4313     if (U_FAILURE(status)) {
4314         dataerrln("RegexPattern::compile() error");
4315         return;
4316     }
4317     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4318 
4319     //
4320     //  Regex to split a test file line into fields.
4321     //    There are six fields, separated by tabs.
4322     //
4323     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4324 
4325     //
4326     //  Regex to identify test patterns with flag settings, and to separate them.
4327     //    Test patterns with flags look like 'pattern'i
4328     //    Test patterns without flags are not quoted:   pattern
4329     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4330     //
4331     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4332     RegexMatcher* flagMat = flagPat->matcher(status);
4333 
4334     //
4335     // The Perl tests reference several perl-isms, which are evaluated/substituted
4336     //   in the test data.  Not being perl, this must be done explicitly.  Here
4337     //   are string constants and REs for these constructs.
4338     //
4339     UnicodeString nulnulSrc("${nulnul}");
4340     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4341     nulnul = nulnul.unescape();
4342 
4343     UnicodeString ffffSrc("${ffff}");
4344     UnicodeString ffff("\\uffff", -1, US_INV);
4345     ffff = ffff.unescape();
4346 
4347     //  regexp for $-[0], $+[2], etc.
4348     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4349     RegexMatcher *groupsMat = groupsPat->matcher(status);
4350 
4351     //  regexp for $0, $1, $2, etc.
4352     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4353     RegexMatcher *cgMat = cgPat->matcher(status);
4354 
4355 
4356     //
4357     // Main Loop for the Perl Tests, runs once per line from the
4358     //   test data file.
4359     //
4360     int32_t  lineNum = 0;
4361     int32_t  skippedUnimplementedCount = 0;
4362     while (lineMat->find()) {
4363         lineNum++;
4364 
4365         //
4366         //  Get a line, break it into its fields, do the Perl
4367         //    variable substitutions.
4368         //
4369         UnicodeString line = lineMat->group(1, status);
4370         UnicodeString fields[7];
4371         fieldPat->split(line, fields, 7, status);
4372 
4373         flagMat->reset(fields[0]);
4374         flagMat->matches(status);
4375         UnicodeString pattern  = flagMat->group(2, status);
4376         pattern.findAndReplace("${bang}", "!");
4377         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4378         pattern.findAndReplace(ffffSrc, ffff);
4379 
4380         //
4381         //  Identify patterns that include match flag settings,
4382         //    split off the flags, remove the extra quotes.
4383         //
4384         UnicodeString flagStr = flagMat->group(3, status);
4385         if (U_FAILURE(status)) {
4386             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4387             return;
4388         }
4389         int32_t flags = 0;
4390         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4391         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4392         const UChar UChar_m = 0x6d;
4393         const UChar UChar_x = 0x78;
4394         const UChar UChar_y = 0x79;
4395         if (flagStr.indexOf(UChar_i) != -1) {
4396             flags |= UREGEX_CASE_INSENSITIVE;
4397         }
4398         if (flagStr.indexOf(UChar_m) != -1) {
4399             flags |= UREGEX_MULTILINE;
4400         }
4401         if (flagStr.indexOf(UChar_x) != -1) {
4402             flags |= UREGEX_COMMENTS;
4403         }
4404 
4405         //
4406         // Put the pattern in a UTF-8 UText
4407         //
4408         status = U_ZERO_ERROR;
4409         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4410         if (status == U_BUFFER_OVERFLOW_ERROR) {
4411             status = U_ZERO_ERROR;
4412             delete[] patternChars;
4413             patternCapacity = patternLength + 1;
4414             patternChars = new char[patternCapacity];
4415             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4416         }
4417         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4418 
4419         //
4420         // Compile the test pattern.
4421         //
4422         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4423         if (status == U_REGEX_UNIMPLEMENTED) {
4424             //
4425             // Test of a feature that is planned for ICU, but not yet implemented.
4426             //   skip the test.
4427             skippedUnimplementedCount++;
4428             delete testPat;
4429             status = U_ZERO_ERROR;
4430             continue;
4431         }
4432 
4433         if (U_FAILURE(status)) {
4434             // Some tests are supposed to generate errors.
4435             //   Only report an error for tests that are supposed to succeed.
4436             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4437                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4438             {
4439                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4440             }
4441             status = U_ZERO_ERROR;
4442             delete testPat;
4443             continue;
4444         }
4445 
4446         if (fields[2].indexOf(UChar_i) >= 0) {
4447             // ICU should skip this test.
4448             delete testPat;
4449             continue;
4450         }
4451 
4452         if (fields[2].indexOf(UChar_c) >= 0) {
4453             // This pattern should have caused a compilation error, but didn't/
4454             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4455             delete testPat;
4456             continue;
4457         }
4458 
4459 
4460         //
4461         // replace the Perl variables that appear in some of the
4462         //   match data strings.
4463         //
4464         UnicodeString matchString = fields[1];
4465         matchString.findAndReplace(nulnulSrc, nulnul);
4466         matchString.findAndReplace(ffffSrc,   ffff);
4467 
4468         // Replace any \n in the match string with an actual new-line char.
4469         //  Don't do full unescape, as this unescapes more than Perl does, which
4470         //  causes other spurious failures in the tests.
4471         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4472 
4473         //
4474         // Put the input in a UTF-8 UText
4475         //
4476         status = U_ZERO_ERROR;
4477         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4478         if (status == U_BUFFER_OVERFLOW_ERROR) {
4479             status = U_ZERO_ERROR;
4480             delete[] inputChars;
4481             inputCapacity = inputLength + 1;
4482             inputChars = new char[inputCapacity];
4483             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4484         }
4485         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4486 
4487         //
4488         // Run the test, check for expected match/don't match result.
4489         //
4490         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4491         UBool found = testMat->find();
4492         UBool expected = FALSE;
4493         if (fields[2].indexOf(UChar_y) >=0) {
4494             expected = TRUE;
4495         }
4496         if (expected != found) {
4497             errln("line %d: Expected %smatch, got %smatch",
4498                 lineNum, expected?"":"no ", found?"":"no " );
4499             delete testMat;
4500             delete testPat;
4501             continue;
4502         }
4503 
4504         // Don't try to check expected results if there is no match.
4505         //   (Some have stuff in the expected fields)
4506         if (!found) {
4507             delete testMat;
4508             delete testPat;
4509             continue;
4510         }
4511 
4512         //
4513         // Interpret the Perl expression from the fourth field of the data file,
4514         // building up an ICU string from the results of the ICU match.
4515         //   The Perl expression will contain references to the results of
4516         //     a regex match, including the matched string, capture group strings,
4517         //     group starting and ending indices, etc.
4518         //
4519         UnicodeString resultString;
4520         UnicodeString perlExpr = fields[3];
4521 
4522         while (perlExpr.length() > 0) {
4523             groupsMat->reset(perlExpr);
4524             cgMat->reset(perlExpr);
4525 
4526             if (perlExpr.startsWith("$&")) {
4527                 resultString.append(testMat->group(status));
4528                 perlExpr.remove(0, 2);
4529             }
4530 
4531             else if (groupsMat->lookingAt(status)) {
4532                 // $-[0]   $+[2]  etc.
4533                 UnicodeString digitString = groupsMat->group(2, status);
4534                 int32_t t = 0;
4535                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4536                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4537                 int32_t matchPosition;
4538                 if (plusOrMinus.compare("+") == 0) {
4539                     matchPosition = testMat->end(groupNum, status);
4540                 } else {
4541                     matchPosition = testMat->start(groupNum, status);
4542                 }
4543                 if (matchPosition != -1) {
4544                     ICU_Utility::appendNumber(resultString, matchPosition);
4545                 }
4546                 perlExpr.remove(0, groupsMat->end(status));
4547             }
4548 
4549             else if (cgMat->lookingAt(status)) {
4550                 // $1, $2, $3, etc.
4551                 UnicodeString digitString = cgMat->group(1, status);
4552                 int32_t t = 0;
4553                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4554                 if (U_SUCCESS(status)) {
4555                     resultString.append(testMat->group(groupNum, status));
4556                     status = U_ZERO_ERROR;
4557                 }
4558                 perlExpr.remove(0, cgMat->end(status));
4559             }
4560 
4561             else if (perlExpr.startsWith("@-")) {
4562                 int32_t i;
4563                 for (i=0; i<=testMat->groupCount(); i++) {
4564                     if (i>0) {
4565                         resultString.append(" ");
4566                     }
4567                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4568                 }
4569                 perlExpr.remove(0, 2);
4570             }
4571 
4572             else if (perlExpr.startsWith("@+")) {
4573                 int32_t i;
4574                 for (i=0; i<=testMat->groupCount(); i++) {
4575                     if (i>0) {
4576                         resultString.append(" ");
4577                     }
4578                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4579                 }
4580                 perlExpr.remove(0, 2);
4581             }
4582 
4583             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4584                                                      //           or as an escaped sequence (e.g. \n)
4585                 if (perlExpr.length() > 1) {
4586                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4587                 }
4588                 UChar c = perlExpr.charAt(0);
4589                 switch (c) {
4590                 case 'n':   c = '\n'; break;
4591                 // add any other escape sequences that show up in the test expected results.
4592                 }
4593                 resultString.append(c);
4594                 perlExpr.remove(0, 1);
4595             }
4596 
4597             else  {
4598                 // Any characters from the perl expression that we don't explicitly
4599                 //  recognize before here are assumed to be literals and copied
4600                 //  as-is to the expected results.
4601                 resultString.append(perlExpr.charAt(0));
4602                 perlExpr.remove(0, 1);
4603             }
4604 
4605             if (U_FAILURE(status)) {
4606                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4607                 break;
4608             }
4609         }
4610 
4611         //
4612         // Expected Results Compare
4613         //
4614         UnicodeString expectedS(fields[4]);
4615         expectedS.findAndReplace(nulnulSrc, nulnul);
4616         expectedS.findAndReplace(ffffSrc,   ffff);
4617         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4618 
4619 
4620         if (expectedS.compare(resultString) != 0) {
4621             err("Line %d: Incorrect perl expression results.", lineNum);
4622             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4623         }
4624 
4625         delete testMat;
4626         delete testPat;
4627     }
4628 
4629     //
4630     // All done.  Clean up allocated stuff.
4631     //
4632     delete cgMat;
4633     delete cgPat;
4634 
4635     delete groupsMat;
4636     delete groupsPat;
4637 
4638     delete flagMat;
4639     delete flagPat;
4640 
4641     delete lineMat;
4642     delete linePat;
4643 
4644     delete fieldPat;
4645     delete [] testData;
4646 
4647     utext_close(&patternText);
4648     utext_close(&inputText);
4649 
4650     delete [] patternChars;
4651     delete [] inputChars;
4652 
4653 
4654     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4655 
4656 }
4657 
4658 
4659 //--------------------------------------------------------------
4660 //
4661 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4662 //             Use this pattern,
4663 //                 "(a?){1,8000000}"
//             Note: this was originally an unbounded upper bound, but that case now has loop-breaking enabled.
4665 //                   This test is likely to be fragile, as further optimizations stop
4666 //                   more cases of pointless looping in the match engine.
4667 //
4668 //---------------------------------------------------------------
Bug6149()4669 void RegexTest::Bug6149() {
4670     UnicodeString pattern("(a?){1,8000000}");
4671     UnicodeString s("xyz");
4672     uint32_t flags = 0;
4673     UErrorCode status = U_ZERO_ERROR;
4674 
4675     RegexMatcher  matcher(pattern, s, flags, status);
4676     UBool result = false;
4677     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4678     REGEX_ASSERT(result == FALSE);
4679  }
4680 
4681 
4682 //
4683 //   Callbacks()    Test the callback function.
4684 //                  When set, callbacks occur periodically during matching operations,
4685 //                  giving the application code the ability to abort the operation
//                  before its normal completion.
4687 //
4688 
4689 struct callBackContext {
4690     RegexTest        *test;
4691     int32_t          maxCalls;
4692     int32_t          numCalls;
4693     int32_t          lastSteps;
resetcallBackContext4694     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
4695 };
4696 
4697 U_CDECL_BEGIN
4698 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4699 testCallBackFn(const void *context, int32_t steps) {
4700     callBackContext  *info = (callBackContext *)context;
4701     if (info->lastSteps+1 != steps) {
4702         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4703     }
4704     info->lastSteps = steps;
4705     info->numCalls++;
4706     return (info->numCalls < info->maxCalls);
4707 }
4708 U_CDECL_END
4709 
Callbacks()4710 void RegexTest::Callbacks() {
4711    {
4712         // Getter returns NULLs if no callback has been set
4713 
4714         //   The variables that the getter will fill in.
4715         //   Init to non-null values so that the action of the getter can be seen.
4716         const void          *returnedContext = &returnedContext;
4717         URegexMatchCallback *returnedFn = &testCallBackFn;
4718 
4719         UErrorCode status = U_ZERO_ERROR;
4720         RegexMatcher matcher("x", 0, status);
4721         REGEX_CHECK_STATUS;
4722         matcher.getMatchCallback(returnedFn, returnedContext, status);
4723         REGEX_CHECK_STATUS;
4724         REGEX_ASSERT(returnedFn == NULL);
4725         REGEX_ASSERT(returnedContext == NULL);
4726     }
4727 
4728    {
4729         // Set and Get work
4730         callBackContext cbInfo = {this, 0, 0, 0};
4731         const void          *returnedContext;
4732         URegexMatchCallback *returnedFn;
4733         UErrorCode status = U_ZERO_ERROR;
4734         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4735         REGEX_CHECK_STATUS;
4736         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4737         REGEX_CHECK_STATUS;
4738         matcher.getMatchCallback(returnedFn, returnedContext, status);
4739         REGEX_CHECK_STATUS;
4740         REGEX_ASSERT(returnedFn == testCallBackFn);
4741         REGEX_ASSERT(returnedContext == &cbInfo);
4742 
4743         // A short-running match shouldn't invoke the callback
4744         status = U_ZERO_ERROR;
4745         cbInfo.reset(1);
4746         UnicodeString s = "xxx";
4747         matcher.reset(s);
4748         REGEX_ASSERT(matcher.matches(status));
4749         REGEX_CHECK_STATUS;
4750         REGEX_ASSERT(cbInfo.numCalls == 0);
4751 
4752         // A medium-length match that runs long enough to invoke the
4753         //   callback, but not so long that the callback aborts it.
4754         status = U_ZERO_ERROR;
4755         cbInfo.reset(4);
4756         s = "aaaaaaaaaaaaaaaaaaab";
4757         matcher.reset(s);
4758         REGEX_ASSERT(matcher.matches(status)==FALSE);
4759         REGEX_CHECK_STATUS;
4760         REGEX_ASSERT(cbInfo.numCalls > 0);
4761 
4762         // A longer running match that the callback function will abort.
4763         status = U_ZERO_ERROR;
4764         cbInfo.reset(4);
4765         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4766         matcher.reset(s);
4767         REGEX_ASSERT(matcher.matches(status)==FALSE);
4768         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4769         REGEX_ASSERT(cbInfo.numCalls == 4);
4770 
4771         // A longer running find that the callback function will abort.
4772         status = U_ZERO_ERROR;
4773         cbInfo.reset(4);
4774         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4775         matcher.reset(s);
4776         REGEX_ASSERT(matcher.find(status)==FALSE);
4777         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4778         REGEX_ASSERT(cbInfo.numCalls == 4);
4779     }
4780 
4781 
4782 }
4783 
4784 
4785 //
4786 //   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4790 //
4791 
4792 struct progressCallBackContext {
4793     RegexTest        *test;
4794     int64_t          lastIndex;
4795     int32_t          maxCalls;
4796     int32_t          numCalls;
resetprogressCallBackContext4797     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
4798 };
4799 
4800 // call-back function for find().
4801 // Return TRUE to continue the find().
4802 // Return FALSE to stop the find().
4803 U_CDECL_BEGIN
4804 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4805 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4806     progressCallBackContext  *info = (progressCallBackContext *)context;
4807     info->numCalls++;
4808     info->lastIndex = matchIndex;
4809 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4810     return (info->numCalls < info->maxCalls);
4811 }
4812 U_CDECL_END
4813 
FindProgressCallbacks()4814 void RegexTest::FindProgressCallbacks() {
4815    {
4816         // Getter returns NULLs if no callback has been set
4817 
4818         //   The variables that the getter will fill in.
4819         //   Init to non-null values so that the action of the getter can be seen.
4820         const void                  *returnedContext = &returnedContext;
4821         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4822 
4823         UErrorCode status = U_ZERO_ERROR;
4824         RegexMatcher matcher("x", 0, status);
4825         REGEX_CHECK_STATUS;
4826         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4827         REGEX_CHECK_STATUS;
4828         REGEX_ASSERT(returnedFn == NULL);
4829         REGEX_ASSERT(returnedContext == NULL);
4830     }
4831 
4832    {
4833         // Set and Get work
4834         progressCallBackContext cbInfo = {this, 0, 0, 0};
4835         const void                  *returnedContext;
4836         URegexFindProgressCallback  *returnedFn;
4837         UErrorCode status = U_ZERO_ERROR;
4838         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4839         REGEX_CHECK_STATUS;
4840         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4841         REGEX_CHECK_STATUS;
4842         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4843         REGEX_CHECK_STATUS;
4844         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4845         REGEX_ASSERT(returnedContext == &cbInfo);
4846 
4847         // A find that matches on the initial position does NOT invoke the callback.
4848         status = U_ZERO_ERROR;
4849         cbInfo.reset(100);
4850         UnicodeString s = "aaxxx";
4851         matcher.reset(s);
4852 #if 0
4853         matcher.setTrace(TRUE);
4854 #endif
4855         REGEX_ASSERT(matcher.find(0, status));
4856         REGEX_CHECK_STATUS;
4857         REGEX_ASSERT(cbInfo.numCalls == 0);
4858 
4859         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4860         //   but not so many times that we interrupt the operation.
4861         status = U_ZERO_ERROR;
4862         s = "aaaaaaaaaaaaaaaaaaab";
4863         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4864         matcher.reset(s);
4865         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4866         REGEX_CHECK_STATUS;
4867         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4868 
4869         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4870         status = U_ZERO_ERROR;
4871         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4872         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4873         matcher.reset(s1);
4874         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4875         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4876         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4877 
4878         // Now a match that will succeed, but after an interruption
4879         status = U_ZERO_ERROR;
4880         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4881         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4882         matcher.reset(s2);
4883         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4884         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4885         // Now retry the match from where left off
4886         cbInfo.maxCalls = 100; //  No callback limit
4887         status = U_ZERO_ERROR;
4888         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4889         REGEX_CHECK_STATUS;
4890     }
4891 
4892 
4893 }
4894 
4895 
4896 //---------------------------------------------------------------------------
4897 //
4898 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4899 //                             UTexts. The pure-C implementation of UText
4900 //                             has no mutable backing stores, but we can
4901 //                             use UnicodeString here to test the functionality.
4902 //
4903 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4904 void RegexTest::PreAllocatedUTextCAPI () {
4905     UErrorCode           status = U_ZERO_ERROR;
4906     URegularExpression  *re;
4907     UText                patternText = UTEXT_INITIALIZER;
4908     UnicodeString        buffer;
4909     UText                bufferText = UTEXT_INITIALIZER;
4910 
4911     utext_openUnicodeString(&bufferText, &buffer, &status);
4912 
4913     /*
4914      *  getText() and getUText()
4915      */
4916     {
4917         UText  text1 = UTEXT_INITIALIZER;
4918         UText  text2 = UTEXT_INITIALIZER;
4919         UChar  text2Chars[20];
4920         UText  *resultText;
4921 
4922         status = U_ZERO_ERROR;
4923         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4924         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4925         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4926         utext_openUChars(&text2, text2Chars, -1, &status);
4927 
4928         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4929         re = uregex_openUText(&patternText, 0, NULL, &status);
4930 
4931         /* First set a UText */
4932         uregex_setUText(re, &text1, &status);
4933         resultText = uregex_getUText(re, &bufferText, &status);
4934         REGEX_CHECK_STATUS;
4935         REGEX_ASSERT(resultText == &bufferText);
4936         utext_setNativeIndex(resultText, 0);
4937         utext_setNativeIndex(&text1, 0);
4938         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4939 
4940         resultText = uregex_getUText(re, &bufferText, &status);
4941         REGEX_CHECK_STATUS;
4942         REGEX_ASSERT(resultText == &bufferText);
4943         utext_setNativeIndex(resultText, 0);
4944         utext_setNativeIndex(&text1, 0);
4945         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4946 
4947         /* Then set a UChar * */
4948         uregex_setText(re, text2Chars, 7, &status);
4949         resultText = uregex_getUText(re, &bufferText, &status);
4950         REGEX_CHECK_STATUS;
4951         REGEX_ASSERT(resultText == &bufferText);
4952         utext_setNativeIndex(resultText, 0);
4953         utext_setNativeIndex(&text2, 0);
4954         REGEX_ASSERT(testUTextEqual(resultText, &text2));
4955 
4956         uregex_close(re);
4957         utext_close(&text1);
4958         utext_close(&text2);
4959     }
4960 
4961     /*
4962      *  group()
4963      */
4964     {
4965         UChar    text1[80];
4966         UText   *actual;
4967         UBool    result;
4968         int64_t  length = 0;
4969 
4970         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
4971         //                  012345678901234567890123456789012345678901234567
4972         //                  0         1         2         3         4
4973 
4974         status = U_ZERO_ERROR;
4975         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4976         REGEX_CHECK_STATUS;
4977 
4978         uregex_setText(re, text1, -1, &status);
4979         result = uregex_find(re, 0, &status);
4980         REGEX_ASSERT(result==TRUE);
4981 
4982         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
4983         status = U_ZERO_ERROR;
4984         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
4985         REGEX_CHECK_STATUS;
4986         REGEX_ASSERT(actual == &bufferText);
4987         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
4988         REGEX_ASSERT(length == 16);
4989         REGEX_ASSERT(utext_nativeLength(actual) == 47);
4990 
4991         /*  Capture group #1.  Should succeed, matching " interior ". */
4992         status = U_ZERO_ERROR;
4993         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
4994         REGEX_CHECK_STATUS;
4995         REGEX_ASSERT(actual == &bufferText);
4996         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
4997         REGEX_ASSERT(length == 10);
4998         REGEX_ASSERT(utext_nativeLength(actual) == 47);
4999 
5000         /*  Capture group out of range.  Error. */
5001         status = U_ZERO_ERROR;
5002         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5003         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5004         REGEX_ASSERT(actual == &bufferText);
5005         uregex_close(re);
5006 
5007     }
5008 
5009     /*
5010      *  replaceFirst()
5011      */
5012     {
5013         UChar    text1[80];
5014         UChar    text2[80];
5015         UText    replText = UTEXT_INITIALIZER;
5016         UText   *result;
5017         status = U_ZERO_ERROR;
5018         utext_openUnicodeString(&bufferText, &buffer, &status);
5019 
5020         status = U_ZERO_ERROR;
5021         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5022         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5023         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5024 
5025         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5026         REGEX_CHECK_STATUS;
5027 
5028         /*  Normal case, with match */
5029         uregex_setText(re, text1, -1, &status);
5030         REGEX_CHECK_STATUS;
5031         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5032         REGEX_CHECK_STATUS;
5033         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5034         REGEX_CHECK_STATUS;
5035         REGEX_ASSERT(result == &bufferText);
5036         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5037 
5038         /* No match.  Text should copy to output with no changes.  */
5039         uregex_setText(re, text2, -1, &status);
5040         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5041         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5042         REGEX_CHECK_STATUS;
5043         REGEX_ASSERT(result == &bufferText);
5044         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5045 
5046         /* Unicode escapes */
5047         uregex_setText(re, text1, -1, &status);
5048         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5049         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5050         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5051         REGEX_CHECK_STATUS;
5052         REGEX_ASSERT(result == &bufferText);
5053         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5054 
5055         uregex_close(re);
5056         utext_close(&replText);
5057     }
5058 
5059 
5060     /*
5061      *  replaceAll()
5062      */
5063     {
5064         UChar    text1[80];
5065         UChar    text2[80];
5066         UText    replText = UTEXT_INITIALIZER;
5067         UText   *result;
5068 
5069         status = U_ZERO_ERROR;
5070         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5071         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5072         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5073 
5074         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5075         REGEX_CHECK_STATUS;
5076 
5077         /*  Normal case, with match */
5078         uregex_setText(re, text1, -1, &status);
5079         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5080         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5081         REGEX_CHECK_STATUS;
5082         REGEX_ASSERT(result == &bufferText);
5083         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5084 
5085         /* No match.  Text should copy to output with no changes.  */
5086         uregex_setText(re, text2, -1, &status);
5087         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5088         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5089         REGEX_CHECK_STATUS;
5090         REGEX_ASSERT(result == &bufferText);
5091         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5092 
5093         uregex_close(re);
5094         utext_close(&replText);
5095     }
5096 
5097 
5098     /*
5099      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5100      *   so we don't need to test it here.
5101      */
5102 
5103     utext_close(&bufferText);
5104     utext_close(&patternText);
5105 }
5106 
5107 
5108 //--------------------------------------------------------------
5109 //
5110 //  NamedCapture   Check basic named capture group functionality
5111 //
5112 //--------------------------------------------------------------
NamedCapture()5113 void RegexTest::NamedCapture() {
5114     UErrorCode status = U_ZERO_ERROR;
5115     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5116             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5117     REGEX_CHECK_STATUS;
5118     int32_t group = pat->groupNumberFromName("five", -1, status);
5119     REGEX_CHECK_STATUS;
5120     REGEX_ASSERT(5 == group);
5121     group = pat->groupNumberFromName("three", -1, status);
5122     REGEX_CHECK_STATUS;
5123     REGEX_ASSERT(3 == group);
5124 
5125     status = U_ZERO_ERROR;
5126     group = pat->groupNumberFromName(UnicodeString("six"), status);
5127     REGEX_CHECK_STATUS;
5128     REGEX_ASSERT(6 == group);
5129 
5130     status = U_ZERO_ERROR;
5131     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5132     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5133 
5134     status = U_ZERO_ERROR;
5135 
5136     // After copying a pattern, named capture should still work in the copy.
5137     RegexPattern *copiedPat = new RegexPattern(*pat);
5138     REGEX_ASSERT(*copiedPat == *pat);
5139     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5140 
5141     group = copiedPat->groupNumberFromName("five", -1, status);
5142     REGEX_CHECK_STATUS;
5143     REGEX_ASSERT(5 == group);
5144     group = copiedPat->groupNumberFromName("three", -1, status);
5145     REGEX_CHECK_STATUS;
5146     REGEX_ASSERT(3 == group);
5147     delete copiedPat;
5148 
5149     // ReplaceAll with named capture group.
5150     status = U_ZERO_ERROR;
5151     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5152     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5153     REGEX_CHECK_STATUS;
5154     // m.pattern().dumpPattern();
5155     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5156     REGEX_CHECK_STATUS;
5157     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5158     delete m;
5159 
5160     // ReplaceAll, allowed capture group numbers.
5161     text = UnicodeString("abcmxyz");
5162     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5163     REGEX_CHECK_STATUS;
5164 
5165     status = U_ZERO_ERROR;
5166     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5167     REGEX_CHECK_STATUS;
5168     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5169 
5170     status = U_ZERO_ERROR;
5171     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5172     REGEX_CHECK_STATUS;
5173     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5174 
5175     status = U_ZERO_ERROR;
5176     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5177     REGEX_CHECK_STATUS;
5178     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5179 
5180     status = U_ZERO_ERROR;
5181     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5182     REGEX_CHECK_STATUS;
5183     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5184 
5185     status = U_ZERO_ERROR;
5186     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5187     REGEX_CHECK_STATUS;
5188     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5189 
5190     status = U_ZERO_ERROR;
5191     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5192     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5193 
5194     status = U_ZERO_ERROR;
5195     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5196     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5197     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5198 
5199     status = U_ZERO_ERROR;
5200     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5201     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5202     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5203 
5204     status = U_ZERO_ERROR;
5205     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5206     REGEX_CHECK_STATUS;
5207     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5208 
5209     status = U_ZERO_ERROR;
5210     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5211     REGEX_CHECK_STATUS;
5212     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5213 
5214     status = U_ZERO_ERROR;
5215     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5216     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5217 
5218     status = U_ZERO_ERROR;
5219     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5220     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5221 
5222     status = U_ZERO_ERROR;
5223     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5224     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5225 
5226     status = U_ZERO_ERROR;
5227     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5228     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5229 
5230     delete m;
5231 
5232     // Repeat the above replaceAll() tests using the plain C API, which
5233     //  has a separate implementation internally.
5234     //  TODO: factor out the test data.
5235 
5236     status = U_ZERO_ERROR;
5237     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5238     REGEX_CHECK_STATUS;
5239     text = UnicodeString("abcmxyz");
5240     uregex_setText(re, text.getBuffer(), text.length(), &status);
5241     REGEX_CHECK_STATUS;
5242 
5243     UChar resultBuf[100];
5244     int32_t resultLength;
5245     UnicodeString repl;
5246 
5247     status = U_ZERO_ERROR;
5248     repl = UnicodeString("<$0>");
5249     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5250     REGEX_CHECK_STATUS;
5251     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5252 
5253     status = U_ZERO_ERROR;
5254     repl = UnicodeString("<$1>");
5255     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5256     REGEX_CHECK_STATUS;
5257     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5258 
5259     status = U_ZERO_ERROR;
5260     repl = UnicodeString("<${one}>");
5261     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5262     REGEX_CHECK_STATUS;
5263     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5264 
5265     status = U_ZERO_ERROR;
5266     repl = UnicodeString("<$2>");
5267     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5268     REGEX_CHECK_STATUS;
5269     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5270 
5271     status = U_ZERO_ERROR;
5272     repl = UnicodeString("<$3>");
5273     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5274     REGEX_CHECK_STATUS;
5275     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5276 
5277     status = U_ZERO_ERROR;
5278     repl = UnicodeString("<$4>");
5279     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5280     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5281 
5282     status = U_ZERO_ERROR;
5283     repl = UnicodeString("<$04>");
5284     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5285     REGEX_CHECK_STATUS;
5286     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5287 
5288     status = U_ZERO_ERROR;
5289     repl = UnicodeString("<$000016>");
5290     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5291     REGEX_CHECK_STATUS;
5292     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5293 
5294     status = U_ZERO_ERROR;
5295     repl = UnicodeString("<$3$2$1${one}>");
5296     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5297     REGEX_CHECK_STATUS;
5298     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5299 
5300     status = U_ZERO_ERROR;
5301     repl = UnicodeString("$3$2$1${one}");
5302     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5303     REGEX_CHECK_STATUS;
5304     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5305 
5306     status = U_ZERO_ERROR;
5307     repl = UnicodeString("<${noSuchName}>");
5308     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5309     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5310 
5311     status = U_ZERO_ERROR;
5312     repl = UnicodeString("<${invalid-name}>");
5313     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5314     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5315 
5316     status = U_ZERO_ERROR;
5317     repl = UnicodeString("<${one");
5318     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5319     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5320 
5321     status = U_ZERO_ERROR;
5322     repl = UnicodeString("$not a capture group");
5323     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5324     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5325 
5326     uregex_close(re);
5327 }
5328 
5329 //--------------------------------------------------------------
5330 //
5331 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5332 //                       The point is not so much what the exact limit is,
5333 //                       but that a largish number doesn't hit bad non-linear performance,
5334 //                       and that exceeding the limit fails cleanly.
5335 //
5336 //--------------------------------------------------------------
NamedCaptureLimits()5337 void RegexTest::NamedCaptureLimits() {
5338     if (quick) {
5339         logln("Skipping test. Runs in exhuastive mode only.");
5340         return;
5341     }
5342     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5343     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5344     char nnbuf[100];
5345     UnicodeString pattern;
5346     int32_t nn;
5347 
5348     for (nn=1; nn<goodLimit; nn++) {
5349         sprintf(nnbuf, "(?<nn%d>)", nn);
5350         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5351     }
5352     UErrorCode status = U_ZERO_ERROR;
5353     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5354     REGEX_CHECK_STATUS;
5355     for (nn=1; nn<goodLimit; nn++) {
5356         sprintf(nnbuf, "nn%d", nn);
5357         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5358         REGEX_ASSERT(nn == groupNum);
5359         if (nn != groupNum) {
5360             break;
5361         }
5362     }
5363     delete pat;
5364 
5365     pattern.remove();
5366     for (nn=1; nn<failLimit; nn++) {
5367         sprintf(nnbuf, "(?<nn%d>)", nn);
5368         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5369     }
5370     status = U_ZERO_ERROR;
5371     pat = RegexPattern::compile(pattern, 0, status);
5372     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5373     delete pat;
5374 }
5375 
5376 
5377 //--------------------------------------------------------------
5378 //
5379 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5380 //
5381 //---------------------------------------------------------------
Bug7651()5382 void RegexTest::Bug7651() {
5383     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5384     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5385     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5386     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5387     UnicodeString s("#ff @abcd This is test");
5388     RegexPattern  *REPattern = NULL;
5389     RegexMatcher  *REMatcher = NULL;
5390     UErrorCode status = U_ZERO_ERROR;
5391     UParseError pe;
5392 
5393     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5394     REGEX_CHECK_STATUS;
5395     REMatcher = REPattern->matcher(s, status);
5396     REGEX_CHECK_STATUS;
5397     REGEX_ASSERT(REMatcher->find());
5398     REGEX_ASSERT(REMatcher->start(status) == 0);
5399     delete REPattern;
5400     delete REMatcher;
5401     status = U_ZERO_ERROR;
5402 
5403     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5404     REGEX_CHECK_STATUS;
5405     REMatcher = REPattern->matcher(s, status);
5406     REGEX_CHECK_STATUS;
5407     REGEX_ASSERT(REMatcher->find());
5408     REGEX_ASSERT(REMatcher->start(status) == 0);
5409     delete REPattern;
5410     delete REMatcher;
5411     status = U_ZERO_ERROR;
5412  }
5413 
Bug7740()5414 void RegexTest::Bug7740() {
5415     UErrorCode status = U_ZERO_ERROR;
5416     UnicodeString pattern = "(a)";
5417     UnicodeString text = "abcdef";
5418     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5419     REGEX_CHECK_STATUS;
5420     REGEX_ASSERT(m->lookingAt(status));
5421     REGEX_CHECK_STATUS;
5422     status = U_ILLEGAL_ARGUMENT_ERROR;
5423     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5424     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5425     REGEX_ASSERT(s == "");
5426     delete m;
5427 }
5428 
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5430 
Bug8479()5431 void RegexTest::Bug8479() {
5432     UErrorCode status = U_ZERO_ERROR;
5433 
5434     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5435     REGEX_CHECK_STATUS;
5436     if (U_SUCCESS(status))
5437     {
5438         UnicodeString str;
5439         str.setToBogus();
5440         pMatcher->reset(str);
5441         status = U_ZERO_ERROR;
5442         pMatcher->matches(status);
5443         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5444         delete pMatcher;
5445     }
5446 }
5447 
5448 
5449 // Bug 7029
Bug7029()5450 void RegexTest::Bug7029() {
5451     UErrorCode status = U_ZERO_ERROR;
5452 
5453     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5454     UnicodeString text = "abc.def";
5455     UnicodeString splits[10];
5456     REGEX_CHECK_STATUS;
5457     int32_t numFields = pMatcher->split(text, splits, 10, status);
5458     REGEX_CHECK_STATUS;
5459     REGEX_ASSERT(numFields == 8);
5460     delete pMatcher;
5461 }
5462 
5463 // Bug 9283
5464 //   This test is checking for the existence of any supplemental characters that case-fold
5465 //   to a bmp character.
5466 //
5467 //   At the time of this writing there are none. If any should appear in a subsequent release
5468 //   of Unicode, the code in regular expressions compilation that determines the longest
5469 //   possible match for a literal string  will need to be enhanced.
5470 //
5471 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5472 //   for details on what to do in case of a failure of this test.
5473 //
Bug9283()5474 void RegexTest::Bug9283() {
5475 #if !UCONFIG_NO_NORMALIZATION
5476     UErrorCode status = U_ZERO_ERROR;
5477     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5478     REGEX_CHECK_STATUS;
5479     int32_t index;
5480     UChar32 c;
5481     for (index=0; ; index++) {
5482         c = supplementalsWithCaseFolding.charAt(index);
5483         if (c == -1) {
5484             break;
5485         }
5486         UnicodeString cf = UnicodeString(c).foldCase();
5487         REGEX_ASSERT(cf.length() >= 2);
5488     }
5489 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5490 }
5491 
5492 
CheckInvBufSize()5493 void RegexTest::CheckInvBufSize() {
5494   if(inv_next>=INV_BUFSIZ) {
5495     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5496           __FILE__, INV_BUFSIZ, inv_next);
5497   } else {
5498     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5499   }
5500 }
5501 
5502 
Bug10459()5503 void RegexTest::Bug10459() {
5504     UErrorCode status = U_ZERO_ERROR;
5505     UnicodeString patternString("(txt)");
5506     UnicodeString txtString("txt");
5507 
5508     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5509     REGEX_CHECK_STATUS;
5510     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5511     REGEX_CHECK_STATUS;
5512 
5513     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5514     REGEX_CHECK_STATUS;
5515 
5516     uregex_setUText(icu_re, utext_txt, &status);
5517     REGEX_CHECK_STATUS;
5518 
5519     // The bug was that calling uregex_group() before doing a matching operation
5520     //   was causing a segfault. Only for Regular Expressions created from UText.
5521     //   It should set an U_REGEX_INVALID_STATE.
5522 
5523     UChar buf[100];
5524     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5525     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5526     REGEX_ASSERT(len == 0);
5527 
5528     uregex_close(icu_re);
5529     utext_close(utext_pat);
5530     utext_close(utext_txt);
5531 }
5532 
TestCaseInsensitiveStarters()5533 void RegexTest::TestCaseInsensitiveStarters() {
5534     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5535     //  become stale because of new Unicode characters.
5536     // If it is stale, rerun the generation tool
5537     //    https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
5538     // and replace the embedded data in i18n/regexcmp.cpp
5539 
5540     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5541         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5542             continue;
5543         }
5544         UnicodeSet s(cp, cp);
5545         s.closeOver(USET_CASE_INSENSITIVE);
5546         UnicodeSetIterator setIter(s);
5547         while (setIter.next()) {
5548             if (!setIter.isString()) {
5549                 continue;
5550             }
5551             const UnicodeString &str = setIter.getString();
5552             UChar32 firstChar = str.char32At(0);
5553             UnicodeSet starters;
5554             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5555             if (!starters.contains(cp)) {
5556                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5557                 return;
5558             }
5559         }
5560     }
5561 }
5562 
5563 
TestBug11049()5564 void RegexTest::TestBug11049() {
5565     // Original bug report: pattern with match start consisting of one of several individual characters,
5566     //  and the text being matched ending with a supplementary character. find() would read past the
5567     //  end of the input text when searching for potential match starting points.
5568 
5569     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5570     // detect the bad read.
5571 
5572     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5573     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5574 
5575     // Test again with a pattern starting with a single character,
5576     // which takes a different code path than starting with an OR expression,
5577     // but with similar logic.
5578     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5579     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5580 }
5581 
5582 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5583 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5584     UErrorCode status = U_ZERO_ERROR;
5585     UnicodeString patternString = UnicodeString(pattern).unescape();
5586     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5587 
5588     UnicodeString dataString = UnicodeString(data).unescape();
5589     UChar *exactBuffer = new UChar[dataString.length()];
5590     dataString.extract(exactBuffer, dataString.length(), status);
5591     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5592 
5593     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5594     REGEX_CHECK_STATUS;
5595     matcher->reset(ut);
5596     UBool result = matcher->find();
5597     if (result != expectMatch) {
5598         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5599               __FILE__, lineNumber, expectMatch, result, pattern, data);
5600     }
5601 
5602     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5603     //   off-by-one on find() with match at the last code point.
5604     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5605     //   because string.unescape() will only shrink it.
5606     char * utf8Buffer = new char[uprv_strlen(data)+1];
5607     u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5608     REGEX_CHECK_STATUS;
5609     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5610     REGEX_CHECK_STATUS;
5611     matcher->reset(ut);
5612     result = matcher->find();
5613     if (result != expectMatch) {
5614         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5615               __FILE__, lineNumber, expectMatch, result, pattern, data);
5616     }
5617     delete [] utf8Buffer;
5618 
5619     utext_close(ut);
5620     delete [] exactBuffer;
5621 }
5622 
5623 
TestBug11371()5624 void RegexTest::TestBug11371() {
5625     if (quick) {
5626         logln("Skipping test. Runs in exhuastive mode only.");
5627         return;
5628     }
5629     UErrorCode status = U_ZERO_ERROR;
5630     UnicodeString patternString;
5631 
5632     for (int i=0; i<8000000; i++) {
5633         patternString.append(UnicodeString("()"));
5634     }
5635     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5636     if (status != U_REGEX_PATTERN_TOO_BIG) {
5637         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5638               __FILE__, __LINE__, u_errorName(status));
5639     }
5640 
5641     status = U_ZERO_ERROR;
5642     patternString = "(";
5643     for (int i=0; i<20000000; i++) {
5644         patternString.append(UnicodeString("A++"));
5645     }
5646     patternString.append(UnicodeString("){0}B++"));
5647     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5648     if (status != U_REGEX_PATTERN_TOO_BIG) {
5649         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5650               __FILE__, __LINE__, u_errorName(status));
5651     }
5652 
5653     // Pattern with too much string data, such that string indexes overflow operand data field size
5654     // in compiled instruction.
5655     status = U_ZERO_ERROR;
5656     patternString = "";
5657     while (patternString.length() < 0x00ffffff) {
5658         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5659     }
5660     patternString.append(UnicodeString("X? trailing string"));
5661     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5662     if (status != U_REGEX_PATTERN_TOO_BIG) {
5663         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5664               __FILE__, __LINE__, u_errorName(status));
5665     }
5666 }
5667 
TestBug11480()5668 void RegexTest::TestBug11480() {
5669     // C API, get capture group of a group that does not participate in the match.
5670     //        (Returns a zero length string, with nul termination,
5671     //         indistinguishable from a group with a zero length match.)
5672 
5673     UErrorCode status = U_ZERO_ERROR;
5674     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5675     REGEX_CHECK_STATUS;
5676     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5677     uregex_setText(re, text.getBuffer(), text.length(), &status);
5678     REGEX_CHECK_STATUS;
5679     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5680     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5681     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5682     REGEX_ASSERT(length == 0);
5683     REGEX_ASSERT(buf[0] == 13);
5684     REGEX_ASSERT(buf[1] == 0);
5685     REGEX_ASSERT(buf[2] == 13);
5686     uregex_close(re);
5687 
5688     // UText C++ API, length of match is 0 for non-participating matches.
5689     UText ut = UTEXT_INITIALIZER;
5690     utext_openUnicodeString(&ut, &text, &status);
5691     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5692     REGEX_CHECK_STATUS;
5693     matcher.reset(&ut);
5694     REGEX_ASSERT(matcher.lookingAt(0, status));
5695 
5696     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5697     int64_t groupLen = -666;
5698     UText group = UTEXT_INITIALIZER;
5699     matcher.group(1, &group, groupLen, status);
5700     REGEX_CHECK_STATUS;
5701     REGEX_ASSERT(groupLen == 1);
5702     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5703 
5704     // Capture group 2, the (B), does not participate in the match.
5705     matcher.group(2, &group, groupLen, status);
5706     REGEX_CHECK_STATUS;
5707     REGEX_ASSERT(groupLen == 0);
5708     REGEX_ASSERT(matcher.start(2, status) == -1);
5709     REGEX_CHECK_STATUS;
5710 }
5711 
TestBug12884()5712 void RegexTest::TestBug12884() {
5713     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5714     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5715     UnicodeString text(u"hello");
5716     UErrorCode status = U_ZERO_ERROR;
5717     RegexMatcher m(pattern, text, 0, status);
5718     REGEX_CHECK_STATUS;
5719     m.setTimeLimit(5, status);
5720     m.find(status);
5721     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5722 
5723     // Non-greedy loops. They take a different code path during matching.
5724     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5725     status = U_ZERO_ERROR;
5726     RegexMatcher ngM(ngPattern, text, 0, status);
5727     REGEX_CHECK_STATUS;
5728     ngM.setTimeLimit(5, status);
5729     ngM.find(status);
5730     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5731 
5732     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5733     StringPiece text8(u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5734                           "carácter, sin importar la plataforma, sin importar el programa,"
5735                           "sin importar el idioma.");
5736     status = U_ZERO_ERROR;
5737     LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
5738     REGEX_CHECK_STATUS;
5739     m.reset(ut.getAlias());
5740     m.find(status);
5741     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5742 
5743     status = U_ZERO_ERROR;
5744     ngM.reset(ut.getAlias());
5745     ngM.find(status);
5746     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5747 }
5748 
// Bug 13631. A find() of a pattern with a zero-length look-behind assertion
//            can cause a read past the end of the input text.
5751 //            The failure is seen when running this test with Clang's Address Sanitizer.
5752 
TestBug13631()5753 void RegexTest::TestBug13631() {
5754     const UChar *pats[] = { u"(?<!^)",
5755                             u"(?<=^)",
5756                             nullptr
5757                           };
5758     for (const UChar **pat=pats; *pat; ++pat) {
5759         UErrorCode status = U_ZERO_ERROR;
5760         UnicodeString upat(*pat);
5761         RegexMatcher matcher(upat, 0, status);
5762         const UChar s =u'a';
5763         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5764         REGEX_CHECK_STATUS;
5765         matcher.reset(ut);
5766         while (matcher.find()) {
5767         }
5768         utext_close(ut);
5769     }
5770 }
5771 
5772 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5773 //           where a following group specification would be expected.
5774 //           Failure shows when running the test under Clang's Address Sanitizer.
5775 
TestBug13632()5776 void RegexTest::TestBug13632() {
5777     UErrorCode status = U_ZERO_ERROR;
5778     URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5779     const char16_t *sourceString = u"Hello, world.";
5780     uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5781 
5782     const int32_t destCap = 20;
5783     char16_t dest[destCap] = {};
5784     const char16_t replacement[] = {u'x', u'$'};    // Not nul terminated string.
5785     uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5786 
5787     assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5788     uregex_close(re);
5789 }
5790 
TestBug20359()5791 void RegexTest::TestBug20359() {
5792     // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5793     // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5794     // Changed implementation to loop instead of recursing.
5795 
5796     UnicodeString pattern;
5797     for (int i=0; i<50000; ++i) {
5798         pattern += u"\\Q\\E";
5799     }
5800     pattern += u"x";
5801 
5802     UErrorCode status = U_ZERO_ERROR;
5803     LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5804                                        0, nullptr, &status));
5805     assertSuccess(WHERE, status);
5806 
5807     // We have passed the point where the bug crashed. The following is a small sanity
5808     // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5809 
5810     uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5811     assertSuccess(WHERE, status);
5812     assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5813     assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5814     assertSuccess(WHERE, status);
5815 }
5816 
5817 
TestBug20863()5818 void RegexTest::TestBug20863() {
5819     // Test that patterns with a large number of named capture groups work correctly.
5820     //
5821     // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5822     // construction of the map from capture names to numbers, and decreasing the
5823     // default size of the map.
5824 
5825     constexpr int GROUP_COUNT = 2000;
5826     std::vector<UnicodeString> groupNames;
5827     for (int32_t i=0; i<GROUP_COUNT; ++i) {
5828         UnicodeString name;
5829         name.append(u"name");
5830         name.append(Int64ToUnicodeString(i));
5831         groupNames.push_back(name);
5832     }
5833 
5834     UnicodeString patternString;
5835     for (UnicodeString name: groupNames) {
5836         patternString.append(u"(?<");
5837         patternString.append(name);
5838         patternString.append(u">.)");
5839     }
5840 
5841     UErrorCode status = U_ZERO_ERROR;
5842     UParseError pe;
5843     LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
5844     if (!assertSuccess(WHERE, status)) {
5845         return;
5846     }
5847 
5848     for (int32_t i=0; i<GROUP_COUNT; ++i) {
5849         int32_t group = pattern->groupNumberFromName(groupNames[i], status);
5850         if (!assertSuccess(WHERE, status)) {
5851             return;
5852         }
5853         assertEquals(WHERE, i+1, group);
5854         // Note: group 0 is the overall match; group 1 is the first separate capture group.
5855     }
5856 
5857     // Verify that assignment of patterns with various combinations of named capture work.
5858     // Lazy creation of the internal named capture map changed the implementation logic here.
5859     {
5860         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5861         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5862         assertSuccess(WHERE, status);
5863         assertFalse(WHERE, *pat1 == *pat2);
5864         *pat1 = *pat2;
5865         assertTrue(WHERE, *pat1 == *pat2);
5866         assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
5867         assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
5868         assertSuccess(WHERE, status);
5869     }
5870 
5871     {
5872         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5873         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5874         assertSuccess(WHERE, status);
5875         assertFalse(WHERE, *pat1 == *pat2);
5876         *pat2 = *pat1;
5877         assertTrue(WHERE, *pat1 == *pat2);
5878         assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
5879         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5880         status = U_ZERO_ERROR;
5881         assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
5882         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5883         status = U_ZERO_ERROR;
5884     }
5885 
5886     {
5887         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
5888         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
5889         assertSuccess(WHERE, status);
5890         assertFalse(WHERE, *pat1 == *pat2);
5891         *pat2 = *pat1;
5892         assertTrue(WHERE, *pat1 == *pat2);
5893         assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
5894         assertSuccess(WHERE, status);
5895         assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
5896         assertSuccess(WHERE, status);
5897         assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
5898         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5899         status = U_ZERO_ERROR;
5900         assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
5901         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5902         status = U_ZERO_ERROR;
5903     }
5904 
5905 }
5906 
5907 
5908 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5909