1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 //
10 // regextst.cpp
11 //
12 // ICU Regular Expressions test, part of intltest.
13 //
14
15 /*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/stringpiece.h"
35 #include "unicode/uchar.h"
36 #include "unicode/ucnv.h"
37 #include "unicode/uniset.h"
38 #include "unicode/uregex.h"
39 #include "unicode/usetiter.h"
40 #include "unicode/ustring.h"
41 #include "unicode/utext.h"
42 #include "unicode/utf16.h"
43 #include "cstr.h"
44 #include "regextst.h"
45 #include "regexcmp.h"
46 #include "uvector.h"
47 #include "util.h"
48 #include "cmemory.h"
49 #include "cstring.h"
50 #include "uinvchar.h"
51
52 #define SUPPORT_MUTATING_INPUT_STRING 0
53
54 //---------------------------------------------------------------------------
55 //
56 // Test class boilerplate
57 //
58 //---------------------------------------------------------------------------
RegexTest()59 RegexTest::RegexTest()
60 {
61 }
62
63
~RegexTest()64 RegexTest::~RegexTest()
65 {
66 }
67
68
69
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)70 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
71 {
72 if (exec) logln("TestSuite RegexTest: ");
73 TESTCASE_AUTO_BEGIN;
74 TESTCASE_AUTO(Basic);
75 TESTCASE_AUTO(API_Match);
76 TESTCASE_AUTO(API_Replace);
77 TESTCASE_AUTO(API_Pattern);
78 #if !UCONFIG_NO_FILE_IO
79 TESTCASE_AUTO(Extended);
80 #endif
81 TESTCASE_AUTO(Errors);
82 TESTCASE_AUTO(PerlTests);
83 TESTCASE_AUTO(Callbacks);
84 TESTCASE_AUTO(FindProgressCallbacks);
85 TESTCASE_AUTO(Bug6149);
86 TESTCASE_AUTO(UTextBasic);
87 TESTCASE_AUTO(API_Match_UTF8);
88 TESTCASE_AUTO(API_Replace_UTF8);
89 TESTCASE_AUTO(API_Pattern_UTF8);
90 TESTCASE_AUTO(PerlTestsUTF8);
91 TESTCASE_AUTO(PreAllocatedUTextCAPI);
92 TESTCASE_AUTO(Bug7651);
93 TESTCASE_AUTO(Bug7740);
94 TESTCASE_AUTO(Bug8479);
95 TESTCASE_AUTO(Bug7029);
96 TESTCASE_AUTO(CheckInvBufSize);
97 TESTCASE_AUTO(Bug9283);
98 TESTCASE_AUTO(Bug10459);
99 TESTCASE_AUTO(TestCaseInsensitiveStarters);
100 TESTCASE_AUTO(TestBug11049);
101 TESTCASE_AUTO(TestBug11371);
102 TESTCASE_AUTO(TestBug11480);
103 TESTCASE_AUTO(NamedCapture);
104 TESTCASE_AUTO(NamedCaptureLimits);
105 TESTCASE_AUTO(TestBug12884);
106 TESTCASE_AUTO(TestBug13631);
107 TESTCASE_AUTO(TestBug13632);
108 TESTCASE_AUTO(TestBug20359);
109 TESTCASE_AUTO(TestBug20863);
110 TESTCASE_AUTO_END;
111 }
112
113
114 /**
115 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
116 * into ASCII.
117 * @see utext_openUTF8
118 */
119 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
120
121 //---------------------------------------------------------------------------
122 //
123 // Error Checking / Reporting macros used in all of the tests.
124 //
125 //---------------------------------------------------------------------------
126
utextToPrintable(char * buf,int32_t bufLen,UText * text)127 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
128 int64_t oldIndex = utext_getNativeIndex(text);
129 utext_setNativeIndex(text, 0);
130 char *bufPtr = buf;
131 UChar32 c = utext_next32From(text, 0);
132 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
133 if (0x000020<=c && c<0x00007e) {
134 *bufPtr = c;
135 } else {
136 #if 0
137 sprintf(bufPtr,"U+%04X", c);
138 bufPtr+= strlen(bufPtr)-1;
139 #else
140 *bufPtr = '%';
141 #endif
142 }
143 bufPtr++;
144 c = UTEXT_NEXT32(text);
145 }
146 *bufPtr = 0;
147 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
148 char *ebuf = (char*)malloc(bufLen);
149 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
150 uprv_strncpy(buf, ebuf, bufLen);
151 free((void*)ebuf);
152 #endif
153 utext_setNativeIndex(text, oldIndex);
154 }
155
156
157 static char ASSERT_BUF[1024];
158
extractToAssertBuf(const UnicodeString & message)159 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
160 if(message.length()==0) {
161 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
162 } else {
163 UnicodeString buf;
164 IntlTest::prettify(message,buf);
165 if(buf.length()==0) {
166 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
167 } else {
168 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
169 if(ASSERT_BUF[0]==0) {
170 ASSERT_BUF[0]=0;
171 for(int32_t i=0;i<buf.length();i++) {
172 UChar ch = buf[i];
173 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
174 }
175 }
176 }
177 }
178 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
179 return ASSERT_BUF;
180 }
181
// Log a printable rendering of a UText (built by utextToPrintable() in a
// 200-byte scratch buffer) along with the source location and the spelling of
// the argument expression.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
} UPRV_BLOCK_MACRO_END

// Requires a variable named `status` in scope. If it holds a failure code the
// failure is reported through dataerrln() and the enclosing function returns.
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Report an error when `expr` compares equal to FALSE. Execution continues
// after the report.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END

// Evaluate `expr` with a fresh local `status` (which shadows any outer one, so
// `expr` should pass `status` to the call under test) and report unless that
// local ends up equal to `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// Variant of the status check that also prints a caller-supplied line number.
// It reports the failure but does not return.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
} UPRV_BLOCK_MACRO_END

// Variant of the assertion that also prints a caller-supplied line number.
// It returns from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Compare an expected invariant-character C string with an actual UnicodeString.
//   expected: const char * , restricted to invariant characters.
//   actual:   const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
} UPRV_BLOCK_MACRO_END
232
233
testUTextEqual(UText * uta,UText * utb)234 static UBool testUTextEqual(UText *uta, UText *utb) {
235 UChar32 ca = 0;
236 UChar32 cb = 0;
237 utext_setNativeIndex(uta, 0);
238 utext_setNativeIndex(utb, 0);
239 do {
240 ca = utext_next32(uta);
241 cb = utext_next32(utb);
242 if (ca != cb) {
243 break;
244 }
245 } while (ca != U_SENTINEL);
246 return ca == cb;
247 }
248
249
250 /**
251 * @param expected expected text in UTF-8 (not platform) codepage
252 */
assertUText(const char * expected,UText * actual,const char * file,int line)253 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
254 UErrorCode status = U_ZERO_ERROR;
255 UText expectedText = UTEXT_INITIALIZER;
256 utext_openUTF8(&expectedText, expected, -1, &status);
257 if(U_FAILURE(status)) {
258 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
259 return;
260 }
261 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
262 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
263 return;
264 }
265 utext_setNativeIndex(actual, 0);
266 if (!testUTextEqual(&expectedText, actual)) {
267 char buf[201 /*21*/];
268 char expectedBuf[201];
269 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
270 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
271 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
272 }
273 utext_close(&expectedText);
274 }
275 /**
276 * @param expected invariant (platform local text) input
277 */
278
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)279 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
280 UErrorCode status = U_ZERO_ERROR;
281 UText expectedText = UTEXT_INITIALIZER;
282 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
283 if(U_FAILURE(status)) {
284 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
285 return;
286 }
287 utext_setNativeIndex(actual, 0);
288 if (!testUTextEqual(&expectedText, actual)) {
289 char buf[201 /*21*/];
290 char expectedBuf[201];
291 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
292 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
293 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
294 }
295 utext_close(&expectedText);
296 }
297
/**
 * Check a UText against an expected string given in UTF-8.
 * Forwards to assertUText() with the caller's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Check a UText against an expected string given in invariant characters.
 * Forwards to assertUTextInvariant() with the caller's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
306
307 /**
308 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
309 * passed into utext_openUTF8. An error will be given if
310 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
311 */
312
313 #define INV_BUFSIZ 2048 /* increase this if too small */
314
315 static int64_t inv_next=0;
316
317 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
318 static char inv_buf[INV_BUFSIZ];
319 #endif
320
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)321 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
322 if(length==-1) length=strlen(inv);
323 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
324 inv_next+=length;
325 return utext_openUTF8(ut, inv, length, status);
326 #else
327 if(inv_next+length+1>INV_BUFSIZ) {
328 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
329 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
330 *status = U_MEMORY_ALLOCATION_ERROR;
331 return NULL;
332 }
333
334 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
335 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
336 inv_next+=length;
337
338 #if 0
339 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
340 #endif
341
342 return utext_openUTF8(ut, (const char*)buf, length, status);
343 #endif
344 }
345
346
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM   Shorthand for a quick lookingAt() / matches() check.
//                   Each use runs the same check twice: once through
//                   doRegexLMTest() and once through doRegexLMTestUTF8(),
//                   passing the line number of the macro invocation.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
} UPRV_BLOCK_MACRO_END
365
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)366 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
367 const UnicodeString pattern(pat, -1, US_INV);
368 const UnicodeString inputText(text, -1, US_INV);
369 UErrorCode status = U_ZERO_ERROR;
370 UParseError pe;
371 RegexPattern *REPattern = NULL;
372 RegexMatcher *REMatcher = NULL;
373 UBool retVal = TRUE;
374
375 UnicodeString patString(pat, -1, US_INV);
376 REPattern = RegexPattern::compile(patString, 0, pe, status);
377 if (U_FAILURE(status)) {
378 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
379 line, u_errorName(status));
380 return FALSE;
381 }
382 if (line==376) { REPattern->dumpPattern();}
383
384 UnicodeString inputString(inputText);
385 UnicodeString unEscapedInput = inputString.unescape();
386 REMatcher = REPattern->matcher(unEscapedInput, status);
387 if (U_FAILURE(status)) {
388 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
389 line, u_errorName(status));
390 return FALSE;
391 }
392
393 UBool actualmatch;
394 actualmatch = REMatcher->lookingAt(status);
395 if (U_FAILURE(status)) {
396 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
397 line, u_errorName(status));
398 retVal = FALSE;
399 }
400 if (actualmatch != looking) {
401 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
402 retVal = FALSE;
403 }
404
405 status = U_ZERO_ERROR;
406 actualmatch = REMatcher->matches(status);
407 if (U_FAILURE(status)) {
408 errln("RegexTest failure in matches() at line %d. Status = %s\n",
409 line, u_errorName(status));
410 retVal = FALSE;
411 }
412 if (actualmatch != match) {
413 errln("RegexTest: wrong return from matches() at line %d.\n", line);
414 retVal = FALSE;
415 }
416
417 if (retVal == FALSE) {
418 REPattern->dumpPattern();
419 }
420
421 delete REPattern;
422 delete REMatcher;
423 return retVal;
424 }
425
426
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)427 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
428 UText pattern = UTEXT_INITIALIZER;
429 int32_t inputUTF8Length;
430 char *textChars = NULL;
431 UText inputText = UTEXT_INITIALIZER;
432 UErrorCode status = U_ZERO_ERROR;
433 UParseError pe;
434 RegexPattern *REPattern = NULL;
435 RegexMatcher *REMatcher = NULL;
436 UBool retVal = TRUE;
437
438 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
439 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
440 if (U_FAILURE(status)) {
441 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
442 line, u_errorName(status));
443 return FALSE;
444 }
445
446 UnicodeString inputString(text, -1, US_INV);
447 UnicodeString unEscapedInput = inputString.unescape();
448 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
449 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
450
451 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
452 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
453 // UTF-8 does not allow unpaired surrogates, so this could actually happen
454 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
455 return TRUE; // not a failure of the Regex engine
456 }
457 status = U_ZERO_ERROR; // buffer overflow
458 textChars = new char[inputUTF8Length+1];
459 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
460 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
461
462 REMatcher = &REPattern->matcher(status)->reset(&inputText);
463 if (U_FAILURE(status)) {
464 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
465 line, u_errorName(status));
466 return FALSE;
467 }
468
469 UBool actualmatch;
470 actualmatch = REMatcher->lookingAt(status);
471 if (U_FAILURE(status)) {
472 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
473 line, u_errorName(status));
474 retVal = FALSE;
475 }
476 if (actualmatch != looking) {
477 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
478 retVal = FALSE;
479 }
480
481 status = U_ZERO_ERROR;
482 actualmatch = REMatcher->matches(status);
483 if (U_FAILURE(status)) {
484 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
485 line, u_errorName(status));
486 retVal = FALSE;
487 }
488 if (actualmatch != match) {
489 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
490 retVal = FALSE;
491 }
492
493 if (retVal == FALSE) {
494 REPattern->dumpPattern();
495 }
496
497 delete REPattern;
498 delete REMatcher;
499 utext_close(&inputText);
500 utext_close(&pattern);
501 delete[] textChars;
502 return retVal;
503 }
504
505
506
//---------------------------------------------------------------------------
//
//    REGEX_ERR      Shorthand for testing patterns that are expected to be
//                   rejected (or accepted) by the pattern compiler.
//                   Forwards to regex_err(), adding the line number of the
//                   macro invocation.
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
517
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)518 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
519 UErrorCode expectedStatus, int32_t line) {
520 UnicodeString pattern(pat);
521
522 UErrorCode status = U_ZERO_ERROR;
523 UParseError pe;
524 RegexPattern *callerPattern = NULL;
525
526 //
527 // Compile the caller's pattern
528 //
529 UnicodeString patString(pat);
530 callerPattern = RegexPattern::compile(patString, 0, pe, status);
531 if (status != expectedStatus) {
532 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
533 } else {
534 if (status != U_ZERO_ERROR) {
535 if (pe.line != errLine || pe.offset != errCol) {
536 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
537 line, errLine, errCol, pe.line, pe.offset);
538 }
539 }
540 }
541
542 delete callerPattern;
543
544 //
545 // Compile again, using a UTF-8-based UText
546 //
547 UText patternText = UTEXT_INITIALIZER;
548 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
549 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
550 if (status != expectedStatus) {
551 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
552 } else {
553 if (status != U_ZERO_ERROR) {
554 if (pe.line != errLine || pe.offset != errCol) {
555 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
556 line, errLine, errCol, pe.line, pe.offset);
557 }
558 }
559 }
560
561 delete callerPattern;
562 utext_close(&patternText);
563 }
564
565
566
567 //---------------------------------------------------------------------------
568 //
569 // Basic Check for basic functionality of regex pattern matching.
570 // Avoid the use of REGEX_FIND test macro, which has
571 // substantial dependencies on basic Regex functionality.
572 //
573 //---------------------------------------------------------------------------
Basic()574 void RegexTest::Basic() {
575
576
577 //
578 // Debug - slide failing test cases early
579 //
580 #if 0
581 {
582 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
583 UParseError pe;
584 UErrorCode status = U_ZERO_ERROR;
585 RegexPattern *pattern;
586 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
587 pattern->dumpPattern();
588 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
589 UBool result = m->find();
590 printf("result = %d\n", result);
591 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
592 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
593 }
594 exit(1);
595 #endif
596
597
598 //
599 // Pattern with parentheses
600 //
601 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
602 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
603 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
604
605 //
606 // Patterns with *
607 //
608 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
609 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
610 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
611 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
612 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
613
614 REGEX_TESTLM("a*", "", TRUE, TRUE);
615 REGEX_TESTLM("a*", "b", TRUE, FALSE);
616
617
618 //
619 // Patterns with "."
620 //
621 REGEX_TESTLM(".", "abc", TRUE, FALSE);
622 REGEX_TESTLM("...", "abc", TRUE, TRUE);
623 REGEX_TESTLM("....", "abc", FALSE, FALSE);
624 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
625 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
626 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
627 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
628 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
629
630 //
631 // Patterns with * applied to chars at end of literal string
632 //
633 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
634 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
635
636 //
637 // Supplemental chars match as single chars, not a pair of surrogates.
638 //
639 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
640 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
641 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
642
643
644 //
645 // UnicodeSets in the pattern
646 //
647 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
648 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
649 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
650 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
651 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
652 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
653
654 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
655 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
656 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
657 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
658 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
659
660 //
661 // OR operator in patterns
662 //
663 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
664 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
665 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
666 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
667
668 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
669 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
670 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
671 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
672 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
673 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
674
675 //
676 // +
677 //
678 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
679 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
680 REGEX_TESTLM("b+", "", FALSE, FALSE);
681 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
682 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
683 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
684
685 //
686 // ?
687 //
688 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
689 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
690 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
691 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
692 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
693 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
694 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
695 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
696 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
697
698 //
699 // Escape sequences that become single literal chars, handled internally
700 // by ICU's Unescape.
701 //
702
703 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
704 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
705 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
706 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
707 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
708 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
709 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
710 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
711 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
712 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
713
714 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
715 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
716
717 // Escape of special chars in patterns
718 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
719 }
720
721
722 //---------------------------------------------------------------------------
723 //
724 // UTextBasic Check for quirks that are specific to the UText
725 // implementation.
726 //
727 //---------------------------------------------------------------------------
UTextBasic()728 void RegexTest::UTextBasic() {
729 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
730 UErrorCode status = U_ZERO_ERROR;
731 UText pattern = UTEXT_INITIALIZER;
732 utext_openUTF8(&pattern, str_abc, -1, &status);
733 RegexMatcher matcher(&pattern, 0, status);
734 REGEX_CHECK_STATUS;
735
736 UText input = UTEXT_INITIALIZER;
737 utext_openUTF8(&input, str_abc, -1, &status);
738 REGEX_CHECK_STATUS;
739 matcher.reset(&input);
740 REGEX_CHECK_STATUS;
741 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
742
743 matcher.reset(matcher.inputText());
744 REGEX_CHECK_STATUS;
745 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
746
747 utext_close(&pattern);
748 utext_close(&input);
749 }
750
751
752 //---------------------------------------------------------------------------
753 //
754 // API_Match Test that the API for class RegexMatcher
755 // is present and nominally working, but excluding functions
756 // implementing replace operations.
757 //
758 //---------------------------------------------------------------------------
API_Match()759 void RegexTest::API_Match() {
760 UParseError pe;
761 UErrorCode status=U_ZERO_ERROR;
762 int32_t flags = 0;
763
764 //
765 // Debug - slide failing test cases early
766 //
767 #if 0
768 {
769 }
770 return;
771 #endif
772
773 //
774 // Simple pattern compilation
775 //
776 {
777 UnicodeString re("abc");
778 RegexPattern *pat2;
779 pat2 = RegexPattern::compile(re, flags, pe, status);
780 REGEX_CHECK_STATUS;
781
782 UnicodeString inStr1 = "abcdef this is a test";
783 UnicodeString instr2 = "not abc";
784 UnicodeString empty = "";
785
786
787 //
788 // Matcher creation and reset.
789 //
790 RegexMatcher *m1 = pat2->matcher(inStr1, status);
791 REGEX_CHECK_STATUS;
792 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
793 REGEX_ASSERT(m1->input() == inStr1);
794 m1->reset(instr2);
795 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796 REGEX_ASSERT(m1->input() == instr2);
797 m1->reset(inStr1);
798 REGEX_ASSERT(m1->input() == inStr1);
799 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
800 m1->reset(empty);
801 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
802 REGEX_ASSERT(m1->input() == empty);
803 REGEX_ASSERT(&m1->pattern() == pat2);
804
805 //
806 // reset(pos, status)
807 //
808 m1->reset(inStr1);
809 m1->reset(4, status);
810 REGEX_CHECK_STATUS;
811 REGEX_ASSERT(m1->input() == inStr1);
812 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
813
814 m1->reset(-1, status);
815 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
816 status = U_ZERO_ERROR;
817
818 m1->reset(0, status);
819 REGEX_CHECK_STATUS;
820 status = U_ZERO_ERROR;
821
822 int32_t len = m1->input().length();
823 m1->reset(len-1, status);
824 REGEX_CHECK_STATUS;
825 status = U_ZERO_ERROR;
826
827 m1->reset(len, status);
828 REGEX_CHECK_STATUS;
829 status = U_ZERO_ERROR;
830
831 m1->reset(len+1, status);
832 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
833 status = U_ZERO_ERROR;
834
835 //
836 // match(pos, status)
837 //
838 m1->reset(instr2);
839 REGEX_ASSERT(m1->matches(4, status) == TRUE);
840 m1->reset();
841 REGEX_ASSERT(m1->matches(3, status) == FALSE);
842 m1->reset();
843 REGEX_ASSERT(m1->matches(5, status) == FALSE);
844 REGEX_ASSERT(m1->matches(4, status) == TRUE);
845 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
846 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847
848 // Match() at end of string should fail, but should not
849 // be an error.
850 status = U_ZERO_ERROR;
851 len = m1->input().length();
852 REGEX_ASSERT(m1->matches(len, status) == FALSE);
853 REGEX_CHECK_STATUS;
854
855 // Match beyond end of string should fail with an error.
856 status = U_ZERO_ERROR;
857 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859
860 // Successful match at end of string.
861 {
862 status = U_ZERO_ERROR;
863 RegexMatcher m("A?", 0, status); // will match zero length string.
864 REGEX_CHECK_STATUS;
865 m.reset(inStr1);
866 len = inStr1.length();
867 REGEX_ASSERT(m.matches(len, status) == TRUE);
868 REGEX_CHECK_STATUS;
869 m.reset(empty);
870 REGEX_ASSERT(m.matches(0, status) == TRUE);
871 REGEX_CHECK_STATUS;
872 }
873
874
875 //
876 // lookingAt(pos, status)
877 //
878 status = U_ZERO_ERROR;
879 m1->reset(instr2); // "not abc"
880 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
881 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
882 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
883 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
884 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
885 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886 status = U_ZERO_ERROR;
887 len = m1->input().length();
888 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
889 REGEX_CHECK_STATUS;
890 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
891 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
892
893 delete m1;
894 delete pat2;
895 }
896
897
898 //
899 // Capture Group.
900 // RegexMatcher::start();
901 // RegexMatcher::end();
902 // RegexMatcher::groupCount();
903 //
904 {
905 int32_t flags=0;
906 UParseError pe;
907 UErrorCode status=U_ZERO_ERROR;
908
909 UnicodeString re("01(23(45)67)(.*)");
910 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
911 REGEX_CHECK_STATUS;
912 UnicodeString data = "0123456789";
913
914 RegexMatcher *matcher = pat->matcher(data, status);
915 REGEX_CHECK_STATUS;
916 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
917 static const int32_t matchStarts[] = {0, 2, 4, 8};
918 static const int32_t matchEnds[] = {10, 8, 6, 10};
919 int32_t i;
920 for (i=0; i<4; i++) {
921 int32_t actualStart = matcher->start(i, status);
922 REGEX_CHECK_STATUS;
923 if (actualStart != matchStarts[i]) {
924 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
925 __LINE__, i, matchStarts[i], actualStart);
926 }
927 int32_t actualEnd = matcher->end(i, status);
928 REGEX_CHECK_STATUS;
929 if (actualEnd != matchEnds[i]) {
930 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
931 __LINE__, i, matchEnds[i], actualEnd);
932 }
933 }
934
935 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
936 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
937
938 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940 matcher->reset();
941 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
942
943 matcher->lookingAt(status);
944 REGEX_ASSERT(matcher->group(status) == "0123456789");
945 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
946 REGEX_ASSERT(matcher->group(1, status) == "234567" );
947 REGEX_ASSERT(matcher->group(2, status) == "45" );
948 REGEX_ASSERT(matcher->group(3, status) == "89" );
949 REGEX_CHECK_STATUS;
950 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
951 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
952 matcher->reset();
953 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
954
955 delete matcher;
956 delete pat;
957
958 }
959
960 //
961 // find
962 //
963 {
964 int32_t flags=0;
965 UParseError pe;
966 UErrorCode status=U_ZERO_ERROR;
967
968 UnicodeString re("abc");
969 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
970 REGEX_CHECK_STATUS;
971 UnicodeString data = ".abc..abc...abc..";
972 // 012345678901234567
973
974 RegexMatcher *matcher = pat->matcher(data, status);
975 REGEX_CHECK_STATUS;
976 REGEX_ASSERT(matcher->find());
977 REGEX_ASSERT(matcher->start(status) == 1);
978 REGEX_ASSERT(matcher->find());
979 REGEX_ASSERT(matcher->start(status) == 6);
980 REGEX_ASSERT(matcher->find());
981 REGEX_ASSERT(matcher->start(status) == 12);
982 REGEX_ASSERT(matcher->find() == FALSE);
983 REGEX_ASSERT(matcher->find() == FALSE);
984
985 matcher->reset();
986 REGEX_ASSERT(matcher->find());
987 REGEX_ASSERT(matcher->start(status) == 1);
988
989 REGEX_ASSERT(matcher->find(0, status));
990 REGEX_ASSERT(matcher->start(status) == 1);
991 REGEX_ASSERT(matcher->find(1, status));
992 REGEX_ASSERT(matcher->start(status) == 1);
993 REGEX_ASSERT(matcher->find(2, status));
994 REGEX_ASSERT(matcher->start(status) == 6);
995 REGEX_ASSERT(matcher->find(12, status));
996 REGEX_ASSERT(matcher->start(status) == 12);
997 REGEX_ASSERT(matcher->find(13, status) == FALSE);
998 REGEX_ASSERT(matcher->find(16, status) == FALSE);
999 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1000 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1001
1002 status = U_ZERO_ERROR;
1003 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1004 status = U_ZERO_ERROR;
1005 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1006
1007 REGEX_ASSERT(matcher->groupCount() == 0);
1008
1009 delete matcher;
1010 delete pat;
1011 }
1012
1013
1014 //
1015 // find, with \G in pattern (true if at the end of a previous match).
1016 //
1017 {
1018 int32_t flags=0;
1019 UParseError pe;
1020 UErrorCode status=U_ZERO_ERROR;
1021
1022 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1023 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1024 REGEX_CHECK_STATUS;
1025 UnicodeString data = ".abcabc.abc..";
1026 // 012345678901234567
1027
1028 RegexMatcher *matcher = pat->matcher(data, status);
1029 REGEX_CHECK_STATUS;
1030 REGEX_ASSERT(matcher->find());
1031 REGEX_ASSERT(matcher->start(status) == 0);
1032 REGEX_ASSERT(matcher->start(1, status) == -1);
1033 REGEX_ASSERT(matcher->start(2, status) == 1);
1034
1035 REGEX_ASSERT(matcher->find());
1036 REGEX_ASSERT(matcher->start(status) == 4);
1037 REGEX_ASSERT(matcher->start(1, status) == 4);
1038 REGEX_ASSERT(matcher->start(2, status) == -1);
1039 REGEX_CHECK_STATUS;
1040
1041 delete matcher;
1042 delete pat;
1043 }
1044
1045 //
1046 // find with zero length matches, match position should bump ahead
1047 // to prevent loops.
1048 //
1049 {
1050 int32_t i;
1051 UErrorCode status=U_ZERO_ERROR;
1052 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1053 // using an always-true look-ahead.
1054 REGEX_CHECK_STATUS;
1055 UnicodeString s(" ");
1056 m.reset(s);
1057 for (i=0; ; i++) {
1058 if (m.find() == FALSE) {
1059 break;
1060 }
1061 REGEX_ASSERT(m.start(status) == i);
1062 REGEX_ASSERT(m.end(status) == i);
1063 }
1064 REGEX_ASSERT(i==5);
1065
1066 // Check that the bump goes over surrogate pairs OK
1067 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1068 s = s.unescape();
1069 m.reset(s);
1070 for (i=0; ; i+=2) {
1071 if (m.find() == FALSE) {
1072 break;
1073 }
1074 REGEX_ASSERT(m.start(status) == i);
1075 REGEX_ASSERT(m.end(status) == i);
1076 }
1077 REGEX_ASSERT(i==10);
1078 }
1079 {
1080 // find() loop breaking test.
1081 // with pattern of /.?/, should see a series of one char matches, then a single
1082 // match of zero length at the end of the input string.
1083 int32_t i;
1084 UErrorCode status=U_ZERO_ERROR;
1085 RegexMatcher m(".?", 0, status);
1086 REGEX_CHECK_STATUS;
1087 UnicodeString s(" ");
1088 m.reset(s);
1089 for (i=0; ; i++) {
1090 if (m.find() == FALSE) {
1091 break;
1092 }
1093 REGEX_ASSERT(m.start(status) == i);
1094 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1095 }
1096 REGEX_ASSERT(i==5);
1097 }
1098
1099
1100 //
1101 // Matchers with no input string behave as if they had an empty input string.
1102 //
1103
1104 {
1105 UErrorCode status = U_ZERO_ERROR;
1106 RegexMatcher m(".?", 0, status);
1107 REGEX_CHECK_STATUS;
1108 REGEX_ASSERT(m.find());
1109 REGEX_ASSERT(m.start(status) == 0);
1110 REGEX_ASSERT(m.input() == "");
1111 }
1112 {
1113 UErrorCode status = U_ZERO_ERROR;
1114 RegexPattern *p = RegexPattern::compile(".", 0, status);
1115 RegexMatcher *m = p->matcher(status);
1116 REGEX_CHECK_STATUS;
1117
1118 REGEX_ASSERT(m->find() == FALSE);
1119 REGEX_ASSERT(m->input() == "");
1120 delete m;
1121 delete p;
1122 }
1123
1124 //
1125 // Regions
1126 //
1127 {
1128 UErrorCode status = U_ZERO_ERROR;
1129 UnicodeString testString("This is test data");
1130 RegexMatcher m(".*", testString, 0, status);
1131 REGEX_CHECK_STATUS;
1132 REGEX_ASSERT(m.regionStart() == 0);
1133 REGEX_ASSERT(m.regionEnd() == testString.length());
1134 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1135 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1136
1137 m.region(2,4, status);
1138 REGEX_CHECK_STATUS;
1139 REGEX_ASSERT(m.matches(status));
1140 REGEX_ASSERT(m.start(status)==2);
1141 REGEX_ASSERT(m.end(status)==4);
1142 REGEX_CHECK_STATUS;
1143
1144 m.reset();
1145 REGEX_ASSERT(m.regionStart() == 0);
1146 REGEX_ASSERT(m.regionEnd() == testString.length());
1147
1148 UnicodeString shorterString("short");
1149 m.reset(shorterString);
1150 REGEX_ASSERT(m.regionStart() == 0);
1151 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1152
1153 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1154 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1155 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1156 REGEX_ASSERT(&m == &m.reset());
1157 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1158
1159 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1160 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1161 REGEX_ASSERT(&m == &m.reset());
1162 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1163
1164 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1165 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1166 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1167 REGEX_ASSERT(&m == &m.reset());
1168 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1169
1170 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1171 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1172 REGEX_ASSERT(&m == &m.reset());
1173 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1174
1175 }
1176
1177 //
1178 // hitEnd() and requireEnd()
1179 //
1180 {
1181 UErrorCode status = U_ZERO_ERROR;
1182 UnicodeString testString("aabb");
1183 RegexMatcher m1(".*", testString, 0, status);
1184 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1185 REGEX_ASSERT(m1.hitEnd() == TRUE);
1186 REGEX_ASSERT(m1.requireEnd() == FALSE);
1187 REGEX_CHECK_STATUS;
1188
1189 status = U_ZERO_ERROR;
1190 RegexMatcher m2("a*", testString, 0, status);
1191 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1192 REGEX_ASSERT(m2.hitEnd() == FALSE);
1193 REGEX_ASSERT(m2.requireEnd() == FALSE);
1194 REGEX_CHECK_STATUS;
1195
1196 status = U_ZERO_ERROR;
1197 RegexMatcher m3(".*$", testString, 0, status);
1198 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1199 REGEX_ASSERT(m3.hitEnd() == TRUE);
1200 REGEX_ASSERT(m3.requireEnd() == TRUE);
1201 REGEX_CHECK_STATUS;
1202 }
1203
1204
1205 //
1206 // Compilation error on reset with UChar *
1207 // These were a hazard that people were stumbling over with runtime errors.
1208 // Changed them to compiler errors by adding private methods that more closely
1209 // matched the incorrect use of the functions.
1210 //
1211 #if 0
1212 {
1213 UErrorCode status = U_ZERO_ERROR;
1214 UChar ucharString[20];
1215 RegexMatcher m(".", 0, status);
1216 m.reset(ucharString); // should not compile.
1217
1218 RegexPattern *p = RegexPattern::compile(".", 0, status);
1219 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1220
1221 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1222 }
1223 #endif
1224
1225 //
1226 // Time Outs.
1227 // Note: These tests will need to be changed when the regexp engine is
1228 // able to detect and cut short the exponential time behavior on
1229 // this type of match.
1230 //
1231 {
1232 UErrorCode status = U_ZERO_ERROR;
1233 // Enough 'a's in the string to cause the match to time out.
// (Each additional 'a' doubles the time)
1235 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1236 RegexMatcher matcher("(a+)+b", testString, 0, status);
1237 REGEX_CHECK_STATUS;
1238 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1239 matcher.setTimeLimit(100, status);
1240 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1241 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1242 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1243 }
1244 {
1245 UErrorCode status = U_ZERO_ERROR;
1246 // Few enough 'a's to slip in under the time limit.
1247 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1248 RegexMatcher matcher("(a+)+b", testString, 0, status);
1249 REGEX_CHECK_STATUS;
1250 matcher.setTimeLimit(100, status);
1251 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1252 REGEX_CHECK_STATUS;
1253 }
1254
1255 //
1256 // Stack Limits
1257 //
1258 {
1259 UErrorCode status = U_ZERO_ERROR;
1260 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1261
1262 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1263 // of the '+', and makes the stack frames larger.
1264 RegexMatcher matcher("(A)+A$", testString, 0, status);
1265
1266 // With the default stack, this match should fail to run
1267 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1269
1270 // With unlimited stack, it should run
1271 status = U_ZERO_ERROR;
1272 matcher.setStackLimit(0, status);
1273 REGEX_CHECK_STATUS;
1274 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1275 REGEX_CHECK_STATUS;
1276 REGEX_ASSERT(matcher.getStackLimit() == 0);
1277
// With a limited stack, the match should fail
1279 status = U_ZERO_ERROR;
1280 matcher.setStackLimit(10000, status);
1281 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1282 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1283 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1284 }
1285
1286 // A pattern that doesn't save state should work with
1287 // a minimal sized stack
1288 {
1289 UErrorCode status = U_ZERO_ERROR;
1290 UnicodeString testString = "abc";
1291 RegexMatcher matcher("abc", testString, 0, status);
1292 REGEX_CHECK_STATUS;
1293 matcher.setStackLimit(30, status);
1294 REGEX_CHECK_STATUS;
1295 REGEX_ASSERT(matcher.matches(status) == TRUE);
1296 REGEX_CHECK_STATUS;
1297 REGEX_ASSERT(matcher.getStackLimit() == 30);
1298
1299 // Negative stack sizes should fail
1300 status = U_ZERO_ERROR;
1301 matcher.setStackLimit(1000, status);
1302 REGEX_CHECK_STATUS;
1303 matcher.setStackLimit(-1, status);
1304 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1305 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1306 }
1307
1308
1309 }
1310
1311
1312
1313
1314
1315
1316 //---------------------------------------------------------------------------
1317 //
1318 // API_Replace API test for class RegexMatcher, testing the
1319 // Replace family of functions.
1320 //
1321 //---------------------------------------------------------------------------
API_Replace()1322 void RegexTest::API_Replace() {
1323 //
1324 // Replace
1325 //
1326 int32_t flags=0;
1327 UParseError pe;
1328 UErrorCode status=U_ZERO_ERROR;
1329
1330 UnicodeString re("abc");
1331 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1332 REGEX_CHECK_STATUS;
1333 UnicodeString data = ".abc..abc...abc..";
1334 // 012345678901234567
1335 RegexMatcher *matcher = pat->matcher(data, status);
1336
1337 //
1338 // Plain vanilla matches.
1339 //
1340 UnicodeString dest;
1341 dest = matcher->replaceFirst("yz", status);
1342 REGEX_CHECK_STATUS;
1343 REGEX_ASSERT(dest == ".yz..abc...abc..");
1344
1345 dest = matcher->replaceAll("yz", status);
1346 REGEX_CHECK_STATUS;
1347 REGEX_ASSERT(dest == ".yz..yz...yz..");
1348
1349 //
1350 // Plain vanilla non-matches.
1351 //
1352 UnicodeString d2 = ".abx..abx...abx..";
1353 matcher->reset(d2);
1354 dest = matcher->replaceFirst("yz", status);
1355 REGEX_CHECK_STATUS;
1356 REGEX_ASSERT(dest == ".abx..abx...abx..");
1357
1358 dest = matcher->replaceAll("yz", status);
1359 REGEX_CHECK_STATUS;
1360 REGEX_ASSERT(dest == ".abx..abx...abx..");
1361
1362 //
1363 // Empty source string
1364 //
1365 UnicodeString d3 = "";
1366 matcher->reset(d3);
1367 dest = matcher->replaceFirst("yz", status);
1368 REGEX_CHECK_STATUS;
1369 REGEX_ASSERT(dest == "");
1370
1371 dest = matcher->replaceAll("yz", status);
1372 REGEX_CHECK_STATUS;
1373 REGEX_ASSERT(dest == "");
1374
1375 //
1376 // Empty substitution string
1377 //
1378 matcher->reset(data); // ".abc..abc...abc.."
1379 dest = matcher->replaceFirst("", status);
1380 REGEX_CHECK_STATUS;
1381 REGEX_ASSERT(dest == "...abc...abc..");
1382
1383 dest = matcher->replaceAll("", status);
1384 REGEX_CHECK_STATUS;
1385 REGEX_ASSERT(dest == "........");
1386
1387 //
1388 // match whole string
1389 //
1390 UnicodeString d4 = "abc";
1391 matcher->reset(d4);
1392 dest = matcher->replaceFirst("xyz", status);
1393 REGEX_CHECK_STATUS;
1394 REGEX_ASSERT(dest == "xyz");
1395
1396 dest = matcher->replaceAll("xyz", status);
1397 REGEX_CHECK_STATUS;
1398 REGEX_ASSERT(dest == "xyz");
1399
1400 //
1401 // Capture Group, simple case
1402 //
1403 UnicodeString re2("a(..)");
1404 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1405 REGEX_CHECK_STATUS;
1406 UnicodeString d5 = "abcdefg";
1407 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1408 REGEX_CHECK_STATUS;
1409 dest = matcher2->replaceFirst("$1$1", status);
1410 REGEX_CHECK_STATUS;
1411 REGEX_ASSERT(dest == "bcbcdefg");
1412
1413 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1414 REGEX_CHECK_STATUS;
1415 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1416
1417 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1418 REGEX_ASSERT(U_FAILURE(status));
1419 status = U_ZERO_ERROR;
1420
1421 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1422 replacement = replacement.unescape();
1423 dest = matcher2->replaceFirst(replacement, status);
1424 REGEX_CHECK_STATUS;
1425 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1426
1427 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1428
1429
1430 //
1431 // Replacement String with \u hex escapes
1432 //
1433 {
1434 UnicodeString src = "abc 1 abc 2 abc 3";
1435 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1436 matcher->reset(src);
1437 UnicodeString result = matcher->replaceAll(substitute, status);
1438 REGEX_CHECK_STATUS;
1439 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1440 }
1441 {
1442 UnicodeString src = "abc !";
1443 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1444 matcher->reset(src);
1445 UnicodeString result = matcher->replaceAll(substitute, status);
1446 REGEX_CHECK_STATUS;
1447 UnicodeString expected = UnicodeString("--");
1448 expected.append((UChar32)0x10000);
1449 expected.append("-- !");
1450 REGEX_ASSERT(result == expected);
1451 }
1452 // TODO: need more through testing of capture substitutions.
1453
1454 // Bug 4057
1455 //
1456 {
1457 status = U_ZERO_ERROR;
1458 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1459 RegexMatcher m("ss(.*?)ee", 0, status);
1460 REGEX_CHECK_STATUS;
1461 UnicodeString result;
1462
1463 // Multiple finds do NOT bump up the previous appendReplacement position.
1464 m.reset(s);
1465 m.find();
1466 m.find();
1467 m.appendReplacement(result, "ooh", status);
1468 REGEX_CHECK_STATUS;
1469 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1470
1471 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1472 status = U_ZERO_ERROR;
1473 result.truncate(0);
1474 m.reset(10, status);
1475 m.find();
1476 m.find();
1477 m.appendReplacement(result, "ooh", status);
1478 REGEX_CHECK_STATUS;
1479 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1480
1481 // find() at interior of string, appendReplacemnt still starts at beginning.
1482 status = U_ZERO_ERROR;
1483 result.truncate(0);
1484 m.reset();
1485 m.find(10, status);
1486 m.find();
1487 m.appendReplacement(result, "ooh", status);
1488 REGEX_CHECK_STATUS;
1489 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1490
1491 m.appendTail(result);
1492 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1493
1494 }
1495
1496 delete matcher2;
1497 delete pat2;
1498 delete matcher;
1499 delete pat;
1500 }
1501
1502
1503 //---------------------------------------------------------------------------
1504 //
1505 // API_Pattern Test that the API for class RegexPattern is
1506 // present and nominally working.
1507 //
1508 //---------------------------------------------------------------------------
API_Pattern()1509 void RegexTest::API_Pattern() {
1510 RegexPattern pata; // Test default constructor to not crash.
1511 RegexPattern patb;
1512
1513 REGEX_ASSERT(pata == patb);
1514 REGEX_ASSERT(pata == pata);
1515
1516 UnicodeString re1("abc[a-l][m-z]");
1517 UnicodeString re2("def");
1518 UErrorCode status = U_ZERO_ERROR;
1519 UParseError pe;
1520
1521 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1522 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1523 REGEX_CHECK_STATUS;
1524 REGEX_ASSERT(*pat1 == *pat1);
1525 REGEX_ASSERT(*pat1 != pata);
1526
1527 // Assign
1528 patb = *pat1;
1529 REGEX_ASSERT(patb == *pat1);
1530
1531 // Copy Construct
1532 RegexPattern patc(*pat1);
1533 REGEX_ASSERT(patc == *pat1);
1534 REGEX_ASSERT(patb == patc);
1535 REGEX_ASSERT(pat1 != pat2);
1536 patb = *pat2;
1537 REGEX_ASSERT(patb != patc);
1538 REGEX_ASSERT(patb == *pat2);
1539
1540 // Compile with no flags.
1541 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1542 REGEX_ASSERT(*pat1a == *pat1);
1543
1544 REGEX_ASSERT(pat1a->flags() == 0);
1545
1546 // Compile with different flags should be not equal
1547 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1548 REGEX_CHECK_STATUS;
1549
1550 REGEX_ASSERT(*pat1b != *pat1a);
1551 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1552 REGEX_ASSERT(pat1a->flags() == 0);
1553 delete pat1b;
1554
1555 // clone
1556 RegexPattern *pat1c = pat1->clone();
1557 REGEX_ASSERT(*pat1c == *pat1);
1558 REGEX_ASSERT(*pat1c != *pat2);
1559
1560 delete pat1c;
1561 delete pat1a;
1562 delete pat1;
1563 delete pat2;
1564
1565
1566 //
1567 // Verify that a matcher created from a cloned pattern works.
1568 // (Jitterbug 3423)
1569 //
1570 {
1571 UErrorCode status = U_ZERO_ERROR;
1572 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1573 RegexPattern *pClone = pSource->clone();
1574 delete pSource;
1575 RegexMatcher *mFromClone = pClone->matcher(status);
1576 REGEX_CHECK_STATUS;
1577 UnicodeString s = "Hello World";
1578 mFromClone->reset(s);
1579 REGEX_ASSERT(mFromClone->find() == TRUE);
1580 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1581 REGEX_ASSERT(mFromClone->find() == TRUE);
1582 REGEX_ASSERT(mFromClone->group(status) == "World");
1583 REGEX_ASSERT(mFromClone->find() == FALSE);
1584 delete mFromClone;
1585 delete pClone;
1586 }
1587
1588 //
1589 // matches convenience API
1590 //
1591 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1592 REGEX_CHECK_STATUS;
1593 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1594 REGEX_CHECK_STATUS;
1595 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1596 REGEX_CHECK_STATUS;
1597 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1598 REGEX_CHECK_STATUS;
1599 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1600 REGEX_CHECK_STATUS;
1601 status = U_INDEX_OUTOFBOUNDS_ERROR;
1602 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1603 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1604
1605
1606 //
1607 // Split()
1608 //
1609 status = U_ZERO_ERROR;
1610 pat1 = RegexPattern::compile(" +", pe, status);
1611 REGEX_CHECK_STATUS;
1612 UnicodeString fields[10];
1613
1614 int32_t n;
1615 n = pat1->split("Now is the time", fields, 10, status);
1616 REGEX_CHECK_STATUS;
1617 REGEX_ASSERT(n==4);
1618 REGEX_ASSERT(fields[0]=="Now");
1619 REGEX_ASSERT(fields[1]=="is");
1620 REGEX_ASSERT(fields[2]=="the");
1621 REGEX_ASSERT(fields[3]=="time");
1622 REGEX_ASSERT(fields[4]=="");
1623
1624 n = pat1->split("Now is the time", fields, 2, status);
1625 REGEX_CHECK_STATUS;
1626 REGEX_ASSERT(n==2);
1627 REGEX_ASSERT(fields[0]=="Now");
1628 REGEX_ASSERT(fields[1]=="is the time");
1629 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1630
1631 fields[1] = "*";
1632 status = U_ZERO_ERROR;
1633 n = pat1->split("Now is the time", fields, 1, status);
1634 REGEX_CHECK_STATUS;
1635 REGEX_ASSERT(n==1);
1636 REGEX_ASSERT(fields[0]=="Now is the time");
1637 REGEX_ASSERT(fields[1]=="*");
1638 status = U_ZERO_ERROR;
1639
1640 n = pat1->split(" Now is the time ", fields, 10, status);
1641 REGEX_CHECK_STATUS;
1642 REGEX_ASSERT(n==6);
1643 REGEX_ASSERT(fields[0]=="");
1644 REGEX_ASSERT(fields[1]=="Now");
1645 REGEX_ASSERT(fields[2]=="is");
1646 REGEX_ASSERT(fields[3]=="the");
1647 REGEX_ASSERT(fields[4]=="time");
1648 REGEX_ASSERT(fields[5]=="");
1649
1650 n = pat1->split(" ", fields, 10, status);
1651 REGEX_CHECK_STATUS;
1652 REGEX_ASSERT(n==2);
1653 REGEX_ASSERT(fields[0]=="");
1654 REGEX_ASSERT(fields[1]=="");
1655
1656 fields[0] = "foo";
1657 n = pat1->split("", fields, 10, status);
1658 REGEX_CHECK_STATUS;
1659 REGEX_ASSERT(n==0);
1660 REGEX_ASSERT(fields[0]=="foo");
1661
1662 delete pat1;
1663
1664 // split, with a pattern with (capture)
1665 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1666 REGEX_CHECK_STATUS;
1667
1668 status = U_ZERO_ERROR;
1669 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1670 REGEX_CHECK_STATUS;
1671 REGEX_ASSERT(n==7);
1672 REGEX_ASSERT(fields[0]=="");
1673 REGEX_ASSERT(fields[1]=="a");
1674 REGEX_ASSERT(fields[2]=="Now is ");
1675 REGEX_ASSERT(fields[3]=="b");
1676 REGEX_ASSERT(fields[4]=="the time");
1677 REGEX_ASSERT(fields[5]=="c");
1678 REGEX_ASSERT(fields[6]=="");
1679 REGEX_ASSERT(status==U_ZERO_ERROR);
1680
1681 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1682 REGEX_CHECK_STATUS;
1683 REGEX_ASSERT(n==7);
1684 REGEX_ASSERT(fields[0]==" ");
1685 REGEX_ASSERT(fields[1]=="a");
1686 REGEX_ASSERT(fields[2]=="Now is ");
1687 REGEX_ASSERT(fields[3]=="b");
1688 REGEX_ASSERT(fields[4]=="the time");
1689 REGEX_ASSERT(fields[5]=="c");
1690 REGEX_ASSERT(fields[6]=="");
1691
1692 status = U_ZERO_ERROR;
1693 fields[6] = "foo";
1694 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1695 REGEX_CHECK_STATUS;
1696 REGEX_ASSERT(n==6);
1697 REGEX_ASSERT(fields[0]==" ");
1698 REGEX_ASSERT(fields[1]=="a");
1699 REGEX_ASSERT(fields[2]=="Now is ");
1700 REGEX_ASSERT(fields[3]=="b");
1701 REGEX_ASSERT(fields[4]=="the time");
1702 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1703 REGEX_ASSERT(fields[6]=="foo");
1704
1705 status = U_ZERO_ERROR;
1706 fields[5] = "foo";
1707 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1708 REGEX_CHECK_STATUS;
1709 REGEX_ASSERT(n==5);
1710 REGEX_ASSERT(fields[0]==" ");
1711 REGEX_ASSERT(fields[1]=="a");
1712 REGEX_ASSERT(fields[2]=="Now is ");
1713 REGEX_ASSERT(fields[3]=="b");
1714 REGEX_ASSERT(fields[4]=="the time<c>");
1715 REGEX_ASSERT(fields[5]=="foo");
1716
1717 status = U_ZERO_ERROR;
1718 fields[5] = "foo";
1719 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1720 REGEX_CHECK_STATUS;
1721 REGEX_ASSERT(n==5);
1722 REGEX_ASSERT(fields[0]==" ");
1723 REGEX_ASSERT(fields[1]=="a");
1724 REGEX_ASSERT(fields[2]=="Now is ");
1725 REGEX_ASSERT(fields[3]=="b");
1726 REGEX_ASSERT(fields[4]=="the time");
1727 REGEX_ASSERT(fields[5]=="foo");
1728
1729 status = U_ZERO_ERROR;
1730 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1731 REGEX_CHECK_STATUS;
1732 REGEX_ASSERT(n==4);
1733 REGEX_ASSERT(fields[0]==" ");
1734 REGEX_ASSERT(fields[1]=="a");
1735 REGEX_ASSERT(fields[2]=="Now is ");
1736 REGEX_ASSERT(fields[3]=="the time<c>");
1737 status = U_ZERO_ERROR;
1738 delete pat1;
1739
1740 pat1 = RegexPattern::compile("([-,])", pe, status);
1741 REGEX_CHECK_STATUS;
1742 n = pat1->split("1-10,20", fields, 10, status);
1743 REGEX_CHECK_STATUS;
1744 REGEX_ASSERT(n==5);
1745 REGEX_ASSERT(fields[0]=="1");
1746 REGEX_ASSERT(fields[1]=="-");
1747 REGEX_ASSERT(fields[2]=="10");
1748 REGEX_ASSERT(fields[3]==",");
1749 REGEX_ASSERT(fields[4]=="20");
1750 delete pat1;
1751
1752 // Test split of string with empty trailing fields
1753 pat1 = RegexPattern::compile(",", pe, status);
1754 REGEX_CHECK_STATUS;
1755 n = pat1->split("a,b,c,", fields, 10, status);
1756 REGEX_CHECK_STATUS;
1757 REGEX_ASSERT(n==4);
1758 REGEX_ASSERT(fields[0]=="a");
1759 REGEX_ASSERT(fields[1]=="b");
1760 REGEX_ASSERT(fields[2]=="c");
1761 REGEX_ASSERT(fields[3]=="");
1762
1763 n = pat1->split("a,,,", fields, 10, status);
1764 REGEX_CHECK_STATUS;
1765 REGEX_ASSERT(n==4);
1766 REGEX_ASSERT(fields[0]=="a");
1767 REGEX_ASSERT(fields[1]=="");
1768 REGEX_ASSERT(fields[2]=="");
1769 REGEX_ASSERT(fields[3]=="");
1770 delete pat1;
1771
1772 // Split Separator with zero length match.
1773 pat1 = RegexPattern::compile(":?", pe, status);
1774 REGEX_CHECK_STATUS;
1775 n = pat1->split("abc", fields, 10, status);
1776 REGEX_CHECK_STATUS;
1777 REGEX_ASSERT(n==5);
1778 REGEX_ASSERT(fields[0]=="");
1779 REGEX_ASSERT(fields[1]=="a");
1780 REGEX_ASSERT(fields[2]=="b");
1781 REGEX_ASSERT(fields[3]=="c");
1782 REGEX_ASSERT(fields[4]=="");
1783
1784 delete pat1;
1785
1786 //
1787 // RegexPattern::pattern()
1788 //
1789 pat1 = new RegexPattern();
1790 REGEX_ASSERT(pat1->pattern() == "");
1791 delete pat1;
1792
1793 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1794 REGEX_CHECK_STATUS;
1795 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1796 delete pat1;
1797
1798
1799 //
1800 // classID functions
1801 //
1802 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1803 REGEX_CHECK_STATUS;
1804 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1805 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1806 UnicodeString Hello("Hello, world.");
1807 RegexMatcher *m = pat1->matcher(Hello, status);
1808 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1809 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1810 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1811 delete m;
1812 delete pat1;
1813
1814 }
1815
1816 //---------------------------------------------------------------------------
1817 //
1818 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1819 // is present and working, but excluding functions
1820 // implementing replace operations.
1821 //
1822 //---------------------------------------------------------------------------
API_Match_UTF8()1823 void RegexTest::API_Match_UTF8() {
1824 UParseError pe;
1825 UErrorCode status=U_ZERO_ERROR;
1826 int32_t flags = 0;
1827
1828 //
1829 // Debug - slide failing test cases early
1830 //
1831 #if 0
1832 {
1833 }
1834 return;
1835 #endif
1836
1837 //
1838 // Simple pattern compilation
1839 //
1840 {
1841 UText re = UTEXT_INITIALIZER;
1842 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1843 REGEX_VERBOSE_TEXT(&re);
1844 RegexPattern *pat2;
1845 pat2 = RegexPattern::compile(&re, flags, pe, status);
1846 REGEX_CHECK_STATUS;
1847
1848 UText input1 = UTEXT_INITIALIZER;
1849 UText input2 = UTEXT_INITIALIZER;
1850 UText empty = UTEXT_INITIALIZER;
1851 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1852 REGEX_VERBOSE_TEXT(&input1);
1853 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1854 REGEX_VERBOSE_TEXT(&input2);
1855 utext_openUChars(&empty, NULL, 0, &status);
1856
1857 int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1858 int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1859
1860
1861 //
1862 // Matcher creation and reset.
1863 //
1864 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1865 REGEX_CHECK_STATUS;
1866 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1867 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1868 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869 m1->reset(&input2);
1870 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1871 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1872 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1873 m1->reset(&input1);
1874 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1875 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876 m1->reset(&empty);
1877 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1878 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1879
1880 //
1881 // reset(pos, status)
1882 //
1883 m1->reset(&input1);
1884 m1->reset(4, status);
1885 REGEX_CHECK_STATUS;
1886 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1887 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1888
1889 m1->reset(-1, status);
1890 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1891 status = U_ZERO_ERROR;
1892
1893 m1->reset(0, status);
1894 REGEX_CHECK_STATUS;
1895 status = U_ZERO_ERROR;
1896
1897 m1->reset(input1Len-1, status);
1898 REGEX_CHECK_STATUS;
1899 status = U_ZERO_ERROR;
1900
1901 m1->reset(input1Len, status);
1902 REGEX_CHECK_STATUS;
1903 status = U_ZERO_ERROR;
1904
1905 m1->reset(input1Len+1, status);
1906 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1907 status = U_ZERO_ERROR;
1908
1909 //
1910 // match(pos, status)
1911 //
1912 m1->reset(&input2);
1913 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1914 m1->reset();
1915 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1916 m1->reset();
1917 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1918 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1919 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1920 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1921
1922 // Match() at end of string should fail, but should not
1923 // be an error.
1924 status = U_ZERO_ERROR;
1925 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1926 REGEX_CHECK_STATUS;
1927
1928 // Match beyond end of string should fail with an error.
1929 status = U_ZERO_ERROR;
1930 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1931 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1932
1933 // Successful match at end of string.
1934 {
1935 status = U_ZERO_ERROR;
1936 RegexMatcher m("A?", 0, status); // will match zero length string.
1937 REGEX_CHECK_STATUS;
1938 m.reset(&input1);
1939 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1940 REGEX_CHECK_STATUS;
1941 m.reset(&empty);
1942 REGEX_ASSERT(m.matches(0, status) == TRUE);
1943 REGEX_CHECK_STATUS;
1944 }
1945
1946
1947 //
1948 // lookingAt(pos, status)
1949 //
1950 status = U_ZERO_ERROR;
1951 m1->reset(&input2); // "not abc"
1952 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1953 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1954 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1955 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1956 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1957 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958 status = U_ZERO_ERROR;
1959 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1960 REGEX_CHECK_STATUS;
1961 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1962 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1963
1964 delete m1;
1965 delete pat2;
1966
1967 utext_close(&re);
1968 utext_close(&input1);
1969 utext_close(&input2);
1970 utext_close(&empty);
1971 }
1972
1973
1974 //
1975 // Capture Group.
1976 // RegexMatcher::start();
1977 // RegexMatcher::end();
1978 // RegexMatcher::groupCount();
1979 //
1980 {
1981 int32_t flags=0;
1982 UParseError pe;
1983 UErrorCode status=U_ZERO_ERROR;
1984 UText re=UTEXT_INITIALIZER;
1985 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1986 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1987
1988 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1989 REGEX_CHECK_STATUS;
1990
1991 UText input = UTEXT_INITIALIZER;
1992 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1993 utext_openUTF8(&input, str_0123456789, -1, &status);
1994
1995 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1996 REGEX_CHECK_STATUS;
1997 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1998 static const int32_t matchStarts[] = {0, 2, 4, 8};
1999 static const int32_t matchEnds[] = {10, 8, 6, 10};
2000 int32_t i;
2001 for (i=0; i<4; i++) {
2002 int32_t actualStart = matcher->start(i, status);
2003 REGEX_CHECK_STATUS;
2004 if (actualStart != matchStarts[i]) {
2005 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2006 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2007 }
2008 int32_t actualEnd = matcher->end(i, status);
2009 REGEX_CHECK_STATUS;
2010 if (actualEnd != matchEnds[i]) {
2011 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2012 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2013 }
2014 }
2015
2016 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2017 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2018
2019 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2021 matcher->reset();
2022 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2023
2024 matcher->lookingAt(status);
2025
2026 UnicodeString dest;
2027 UText destText = UTEXT_INITIALIZER;
2028 utext_openUnicodeString(&destText, &dest, &status);
2029 UText *result;
2030 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2031 // Test shallow-clone API
2032 int64_t group_len;
2033 result = matcher->group((UText *)NULL, group_len, status);
2034 REGEX_CHECK_STATUS;
2035 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036 utext_close(result);
2037 result = matcher->group(0, &destText, group_len, status);
2038 REGEX_CHECK_STATUS;
2039 REGEX_ASSERT(result == &destText);
2040 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041 // destText is now immutable, reopen it
2042 utext_close(&destText);
2043 utext_openUnicodeString(&destText, &dest, &status);
2044
2045 int64_t length;
2046 result = matcher->group(0, NULL, length, status);
2047 REGEX_CHECK_STATUS;
2048 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2049 utext_close(result);
2050 result = matcher->group(0, &destText, length, status);
2051 REGEX_CHECK_STATUS;
2052 REGEX_ASSERT(result == &destText);
2053 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2054 REGEX_ASSERT(length == 10);
2055 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2056
2057 // Capture Group 1 == "234567"
2058 result = matcher->group(1, NULL, length, status);
2059 REGEX_CHECK_STATUS;
2060 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2061 REGEX_ASSERT(length == 6);
2062 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2063 utext_close(result);
2064
2065 result = matcher->group(1, &destText, length, status);
2066 REGEX_CHECK_STATUS;
2067 REGEX_ASSERT(result == &destText);
2068 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2069 REGEX_ASSERT(length == 6);
2070 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2071 utext_close(result);
2072
2073 // Capture Group 2 == "45"
2074 result = matcher->group(2, NULL, length, status);
2075 REGEX_CHECK_STATUS;
2076 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2077 REGEX_ASSERT(length == 2);
2078 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079 utext_close(result);
2080
2081 result = matcher->group(2, &destText, length, status);
2082 REGEX_CHECK_STATUS;
2083 REGEX_ASSERT(result == &destText);
2084 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2085 REGEX_ASSERT(length == 2);
2086 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2087 utext_close(result);
2088
2089 // Capture Group 3 == "89"
2090 result = matcher->group(3, NULL, length, status);
2091 REGEX_CHECK_STATUS;
2092 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2093 REGEX_ASSERT(length == 2);
2094 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2095 utext_close(result);
2096
2097 result = matcher->group(3, &destText, length, status);
2098 REGEX_CHECK_STATUS;
2099 REGEX_ASSERT(result == &destText);
2100 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2101 REGEX_ASSERT(length == 2);
2102 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2103 utext_close(result);
2104
2105 // Capture Group number out of range.
2106 status = U_ZERO_ERROR;
2107 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2108 status = U_ZERO_ERROR;
2109 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2110 status = U_ZERO_ERROR;
2111 matcher->reset();
2112 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2113
2114 delete matcher;
2115 delete pat;
2116
2117 utext_close(&destText);
2118 utext_close(&input);
2119 utext_close(&re);
2120 }
2121
2122 //
2123 // find
2124 //
2125 {
2126 int32_t flags=0;
2127 UParseError pe;
2128 UErrorCode status=U_ZERO_ERROR;
2129 UText re=UTEXT_INITIALIZER;
2130 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2131 utext_openUTF8(&re, str_abc, -1, &status);
2132
2133 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2134 REGEX_CHECK_STATUS;
2135 UText input = UTEXT_INITIALIZER;
2136 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2137 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2138 // 012345678901234567
2139
2140 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2141 REGEX_CHECK_STATUS;
2142 REGEX_ASSERT(matcher->find());
2143 REGEX_ASSERT(matcher->start(status) == 1);
2144 REGEX_ASSERT(matcher->find());
2145 REGEX_ASSERT(matcher->start(status) == 6);
2146 REGEX_ASSERT(matcher->find());
2147 REGEX_ASSERT(matcher->start(status) == 12);
2148 REGEX_ASSERT(matcher->find() == FALSE);
2149 REGEX_ASSERT(matcher->find() == FALSE);
2150
2151 matcher->reset();
2152 REGEX_ASSERT(matcher->find());
2153 REGEX_ASSERT(matcher->start(status) == 1);
2154
2155 REGEX_ASSERT(matcher->find(0, status));
2156 REGEX_ASSERT(matcher->start(status) == 1);
2157 REGEX_ASSERT(matcher->find(1, status));
2158 REGEX_ASSERT(matcher->start(status) == 1);
2159 REGEX_ASSERT(matcher->find(2, status));
2160 REGEX_ASSERT(matcher->start(status) == 6);
2161 REGEX_ASSERT(matcher->find(12, status));
2162 REGEX_ASSERT(matcher->start(status) == 12);
2163 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2164 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2165 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2166 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2167
2168 status = U_ZERO_ERROR;
2169 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2170 status = U_ZERO_ERROR;
2171 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2172
2173 REGEX_ASSERT(matcher->groupCount() == 0);
2174
2175 delete matcher;
2176 delete pat;
2177
2178 utext_close(&input);
2179 utext_close(&re);
2180 }
2181
2182
2183 //
2184 // find, with \G in pattern (true if at the end of a previous match).
2185 //
2186 {
2187 int32_t flags=0;
2188 UParseError pe;
2189 UErrorCode status=U_ZERO_ERROR;
2190 UText re=UTEXT_INITIALIZER;
2191 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2192 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2193
2194 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2195
2196 REGEX_CHECK_STATUS;
2197 UText input = UTEXT_INITIALIZER;
2198 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2199 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2200 // 012345678901234567
2201
2202 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2203 REGEX_CHECK_STATUS;
2204 REGEX_ASSERT(matcher->find());
2205 REGEX_ASSERT(matcher->start(status) == 0);
2206 REGEX_ASSERT(matcher->start(1, status) == -1);
2207 REGEX_ASSERT(matcher->start(2, status) == 1);
2208
2209 REGEX_ASSERT(matcher->find());
2210 REGEX_ASSERT(matcher->start(status) == 4);
2211 REGEX_ASSERT(matcher->start(1, status) == 4);
2212 REGEX_ASSERT(matcher->start(2, status) == -1);
2213 REGEX_CHECK_STATUS;
2214
2215 delete matcher;
2216 delete pat;
2217
2218 utext_close(&input);
2219 utext_close(&re);
2220 }
2221
2222 //
2223 // find with zero length matches, match position should bump ahead
2224 // to prevent loops.
2225 //
2226 {
2227 int32_t i;
2228 UErrorCode status=U_ZERO_ERROR;
2229 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2230 // using an always-true look-ahead.
2231 REGEX_CHECK_STATUS;
2232 UText s = UTEXT_INITIALIZER;
2233 utext_openUTF8(&s, " ", -1, &status);
2234 m.reset(&s);
2235 for (i=0; ; i++) {
2236 if (m.find() == FALSE) {
2237 break;
2238 }
2239 REGEX_ASSERT(m.start(status) == i);
2240 REGEX_ASSERT(m.end(status) == i);
2241 }
2242 REGEX_ASSERT(i==5);
2243
2244 // Check that the bump goes over characters outside the BMP OK
2245 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2246 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2247 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2248 m.reset(&s);
2249 for (i=0; ; i+=4) {
2250 if (m.find() == FALSE) {
2251 break;
2252 }
2253 REGEX_ASSERT(m.start(status) == i);
2254 REGEX_ASSERT(m.end(status) == i);
2255 }
2256 REGEX_ASSERT(i==20);
2257
2258 utext_close(&s);
2259 }
2260 {
2261 // find() loop breaking test.
2262 // with pattern of /.?/, should see a series of one char matches, then a single
2263 // match of zero length at the end of the input string.
2264 int32_t i;
2265 UErrorCode status=U_ZERO_ERROR;
2266 RegexMatcher m(".?", 0, status);
2267 REGEX_CHECK_STATUS;
2268 UText s = UTEXT_INITIALIZER;
2269 utext_openUTF8(&s, " ", -1, &status);
2270 m.reset(&s);
2271 for (i=0; ; i++) {
2272 if (m.find() == FALSE) {
2273 break;
2274 }
2275 REGEX_ASSERT(m.start(status) == i);
2276 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2277 }
2278 REGEX_ASSERT(i==5);
2279
2280 utext_close(&s);
2281 }
2282
2283
2284 //
2285 // Matchers with no input string behave as if they had an empty input string.
2286 //
2287
2288 {
2289 UErrorCode status = U_ZERO_ERROR;
2290 RegexMatcher m(".?", 0, status);
2291 REGEX_CHECK_STATUS;
2292 REGEX_ASSERT(m.find());
2293 REGEX_ASSERT(m.start(status) == 0);
2294 REGEX_ASSERT(m.input() == "");
2295 }
2296 {
2297 UErrorCode status = U_ZERO_ERROR;
2298 RegexPattern *p = RegexPattern::compile(".", 0, status);
2299 RegexMatcher *m = p->matcher(status);
2300 REGEX_CHECK_STATUS;
2301
2302 REGEX_ASSERT(m->find() == FALSE);
2303 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2304 delete m;
2305 delete p;
2306 }
2307
2308 //
2309 // Regions
2310 //
2311 {
2312 UErrorCode status = U_ZERO_ERROR;
2313 UText testPattern = UTEXT_INITIALIZER;
2314 UText testText = UTEXT_INITIALIZER;
2315 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2316 REGEX_VERBOSE_TEXT(&testPattern);
2317 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2318 REGEX_VERBOSE_TEXT(&testText);
2319
2320 RegexMatcher m(&testPattern, &testText, 0, status);
2321 REGEX_CHECK_STATUS;
2322 REGEX_ASSERT(m.regionStart() == 0);
2323 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2324 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326
2327 m.region(2,4, status);
2328 REGEX_CHECK_STATUS;
2329 REGEX_ASSERT(m.matches(status));
2330 REGEX_ASSERT(m.start(status)==2);
2331 REGEX_ASSERT(m.end(status)==4);
2332 REGEX_CHECK_STATUS;
2333
2334 m.reset();
2335 REGEX_ASSERT(m.regionStart() == 0);
2336 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2337
2338 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2339 REGEX_VERBOSE_TEXT(&testText);
2340 m.reset(&testText);
2341 REGEX_ASSERT(m.regionStart() == 0);
2342 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2343
2344 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2345 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2346 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2347 REGEX_ASSERT(&m == &m.reset());
2348 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2349
2350 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2351 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352 REGEX_ASSERT(&m == &m.reset());
2353 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2354
2355 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2356 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2357 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2358 REGEX_ASSERT(&m == &m.reset());
2359 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2360
2361 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2362 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2363 REGEX_ASSERT(&m == &m.reset());
2364 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2365
2366 utext_close(&testText);
2367 utext_close(&testPattern);
2368 }
2369
2370 //
2371 // hitEnd() and requireEnd()
2372 //
2373 {
2374 UErrorCode status = U_ZERO_ERROR;
2375 UText testPattern = UTEXT_INITIALIZER;
2376 UText testText = UTEXT_INITIALIZER;
2377 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2378 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2379 utext_openUTF8(&testPattern, str_, -1, &status);
2380 utext_openUTF8(&testText, str_aabb, -1, &status);
2381
2382 RegexMatcher m1(&testPattern, &testText, 0, status);
2383 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2384 REGEX_ASSERT(m1.hitEnd() == TRUE);
2385 REGEX_ASSERT(m1.requireEnd() == FALSE);
2386 REGEX_CHECK_STATUS;
2387
2388 status = U_ZERO_ERROR;
2389 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2390 utext_openUTF8(&testPattern, str_a, -1, &status);
2391 RegexMatcher m2(&testPattern, &testText, 0, status);
2392 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2393 REGEX_ASSERT(m2.hitEnd() == FALSE);
2394 REGEX_ASSERT(m2.requireEnd() == FALSE);
2395 REGEX_CHECK_STATUS;
2396
2397 status = U_ZERO_ERROR;
2398 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2399 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2400 RegexMatcher m3(&testPattern, &testText, 0, status);
2401 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2402 REGEX_ASSERT(m3.hitEnd() == TRUE);
2403 REGEX_ASSERT(m3.requireEnd() == TRUE);
2404 REGEX_CHECK_STATUS;
2405
2406 utext_close(&testText);
2407 utext_close(&testPattern);
2408 }
2409 }
2410
2411
2412 //---------------------------------------------------------------------------
2413 //
2414 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2415 // Replace family of functions.
2416 //
2417 //---------------------------------------------------------------------------
API_Replace_UTF8()2418 void RegexTest::API_Replace_UTF8() {
2419 //
2420 // Replace
2421 //
2422 int32_t flags=0;
2423 UParseError pe;
2424 UErrorCode status=U_ZERO_ERROR;
2425
2426 UText re=UTEXT_INITIALIZER;
2427 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2428 REGEX_VERBOSE_TEXT(&re);
2429 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2430 REGEX_CHECK_STATUS;
2431
2432 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2433 // 012345678901234567
2434 UText dataText = UTEXT_INITIALIZER;
2435 utext_openUTF8(&dataText, data, -1, &status);
2436 REGEX_CHECK_STATUS;
2437 REGEX_VERBOSE_TEXT(&dataText);
2438 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2439
2440 //
2441 // Plain vanilla matches.
2442 //
2443 UnicodeString dest;
2444 UText destText = UTEXT_INITIALIZER;
2445 utext_openUnicodeString(&destText, &dest, &status);
2446 UText *result;
2447
2448 UText replText = UTEXT_INITIALIZER;
2449
2450 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2451 utext_openUTF8(&replText, str_yz, -1, &status);
2452 REGEX_VERBOSE_TEXT(&replText);
2453 result = matcher->replaceFirst(&replText, NULL, status);
2454 REGEX_CHECK_STATUS;
2455 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2456 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2457 utext_close(result);
2458 result = matcher->replaceFirst(&replText, &destText, status);
2459 REGEX_CHECK_STATUS;
2460 REGEX_ASSERT(result == &destText);
2461 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2462
2463 result = matcher->replaceAll(&replText, NULL, status);
2464 REGEX_CHECK_STATUS;
2465 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2466 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2467 utext_close(result);
2468
2469 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470 result = matcher->replaceAll(&replText, &destText, status);
2471 REGEX_CHECK_STATUS;
2472 REGEX_ASSERT(result == &destText);
2473 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2474
2475 //
2476 // Plain vanilla non-matches.
2477 //
2478 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2479 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2480 matcher->reset(&dataText);
2481
2482 result = matcher->replaceFirst(&replText, NULL, status);
2483 REGEX_CHECK_STATUS;
2484 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2485 utext_close(result);
2486 result = matcher->replaceFirst(&replText, &destText, status);
2487 REGEX_CHECK_STATUS;
2488 REGEX_ASSERT(result == &destText);
2489 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2490
2491 result = matcher->replaceAll(&replText, NULL, status);
2492 REGEX_CHECK_STATUS;
2493 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2494 utext_close(result);
2495 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496 result = matcher->replaceAll(&replText, &destText, status);
2497 REGEX_CHECK_STATUS;
2498 REGEX_ASSERT(result == &destText);
2499 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2500
2501 //
2502 // Empty source string
2503 //
2504 utext_openUTF8(&dataText, NULL, 0, &status);
2505 matcher->reset(&dataText);
2506
2507 result = matcher->replaceFirst(&replText, NULL, status);
2508 REGEX_CHECK_STATUS;
2509 REGEX_ASSERT_UTEXT_UTF8("", result);
2510 utext_close(result);
2511 result = matcher->replaceFirst(&replText, &destText, status);
2512 REGEX_CHECK_STATUS;
2513 REGEX_ASSERT(result == &destText);
2514 REGEX_ASSERT_UTEXT_UTF8("", result);
2515
2516 result = matcher->replaceAll(&replText, NULL, status);
2517 REGEX_CHECK_STATUS;
2518 REGEX_ASSERT_UTEXT_UTF8("", result);
2519 utext_close(result);
2520 result = matcher->replaceAll(&replText, &destText, status);
2521 REGEX_CHECK_STATUS;
2522 REGEX_ASSERT(result == &destText);
2523 REGEX_ASSERT_UTEXT_UTF8("", result);
2524
2525 //
2526 // Empty substitution string
2527 //
2528 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2529 matcher->reset(&dataText);
2530
2531 utext_openUTF8(&replText, NULL, 0, &status);
2532 result = matcher->replaceFirst(&replText, NULL, status);
2533 REGEX_CHECK_STATUS;
2534 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2535 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2536 utext_close(result);
2537 result = matcher->replaceFirst(&replText, &destText, status);
2538 REGEX_CHECK_STATUS;
2539 REGEX_ASSERT(result == &destText);
2540 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2541
2542 result = matcher->replaceAll(&replText, NULL, status);
2543 REGEX_CHECK_STATUS;
2544 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2545 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2546 utext_close(result);
2547 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2548 result = matcher->replaceAll(&replText, &destText, status);
2549 REGEX_CHECK_STATUS;
2550 REGEX_ASSERT(result == &destText);
2551 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2552
2553 //
2554 // match whole string
2555 //
2556 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2557 utext_openUTF8(&dataText, str_abc, -1, &status);
2558 matcher->reset(&dataText);
2559
2560 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2561 utext_openUTF8(&replText, str_xyz, -1, &status);
2562 result = matcher->replaceFirst(&replText, NULL, status);
2563 REGEX_CHECK_STATUS;
2564 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2565 utext_close(result);
2566 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2567 result = matcher->replaceFirst(&replText, &destText, status);
2568 REGEX_CHECK_STATUS;
2569 REGEX_ASSERT(result == &destText);
2570 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2571
2572 result = matcher->replaceAll(&replText, NULL, status);
2573 REGEX_CHECK_STATUS;
2574 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2575 utext_close(result);
2576 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577 result = matcher->replaceAll(&replText, &destText, status);
2578 REGEX_CHECK_STATUS;
2579 REGEX_ASSERT(result == &destText);
2580 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2581
2582 //
2583 // Capture Group, simple case
2584 //
2585 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2586 utext_openUTF8(&re, str_add, -1, &status);
2587 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2588 REGEX_CHECK_STATUS;
2589
2590 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2591 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2592 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2593 REGEX_CHECK_STATUS;
2594
2595 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2596 utext_openUTF8(&replText, str_11, -1, &status);
2597 result = matcher2->replaceFirst(&replText, NULL, status);
2598 REGEX_CHECK_STATUS;
2599 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2600 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2601 utext_close(result);
2602 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603 result = matcher2->replaceFirst(&replText, &destText, status);
2604 REGEX_CHECK_STATUS;
2605 REGEX_ASSERT(result == &destText);
2606 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2607
2608 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2609 utext_openUTF8(&replText, str_v, -1, &status);
2610 REGEX_VERBOSE_TEXT(&replText);
2611 result = matcher2->replaceFirst(&replText, NULL, status);
2612 REGEX_CHECK_STATUS;
2613 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2614 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2615 utext_close(result);
2616 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2617 result = matcher2->replaceFirst(&replText, &destText, status);
2618 REGEX_CHECK_STATUS;
2619 REGEX_ASSERT(result == &destText);
2620 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2621
2622 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2623 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2624 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2625 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2626 result = matcher2->replaceFirst(&replText, NULL, status);
2627 REGEX_CHECK_STATUS;
2628 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2629 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2630 utext_close(result);
2631 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2632 result = matcher2->replaceFirst(&replText, &destText, status);
2633 REGEX_CHECK_STATUS;
2634 REGEX_ASSERT(result == &destText);
2635 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2636
2637 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2638 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2639 // 012345678901234567890123456
2640 supplDigitChars[22] = 0xF0;
2641 supplDigitChars[23] = 0x9D;
2642 supplDigitChars[24] = 0x9F;
2643 supplDigitChars[25] = 0x8F;
2644 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2645
2646 result = matcher2->replaceFirst(&replText, NULL, status);
2647 REGEX_CHECK_STATUS;
2648 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2649 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2650 utext_close(result);
2651 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2652 result = matcher2->replaceFirst(&replText, &destText, status);
2653 REGEX_CHECK_STATUS;
2654 REGEX_ASSERT(result == &destText);
2655 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2656 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2657 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2658 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2659 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2660 utext_close(result);
2661 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2662 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2663 REGEX_ASSERT(result == &destText);
2664 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2665
2666 //
2667 // Replacement String with \u hex escapes
2668 //
2669 {
2670 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2671 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2672 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2673 utext_openUTF8(&replText, str_u0043, -1, &status);
2674 matcher->reset(&dataText);
2675
2676 result = matcher->replaceAll(&replText, NULL, status);
2677 REGEX_CHECK_STATUS;
2678 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2679 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2680 utext_close(result);
2681 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2682 result = matcher->replaceAll(&replText, &destText, status);
2683 REGEX_CHECK_STATUS;
2684 REGEX_ASSERT(result == &destText);
2685 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2686 }
2687 {
2688 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2689 utext_openUTF8(&dataText, str_abc, -1, &status);
2690 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2691 utext_openUTF8(&replText, str_U00010000, -1, &status);
2692 matcher->reset(&dataText);
2693
2694 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2695 // 0123456789
2696 expected[2] = 0xF0;
2697 expected[3] = 0x90;
2698 expected[4] = 0x80;
2699 expected[5] = 0x80;
2700
2701 result = matcher->replaceAll(&replText, NULL, status);
2702 REGEX_CHECK_STATUS;
2703 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2704 utext_close(result);
2705 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2706 result = matcher->replaceAll(&replText, &destText, status);
2707 REGEX_CHECK_STATUS;
2708 REGEX_ASSERT(result == &destText);
2709 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2710 }
2711 // TODO: need more thorough testing of capture substitutions.
2712
2713 // Bug 4057
2714 //
2715 {
2716 status = U_ZERO_ERROR;
2717 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2718 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2719 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2720 utext_openUTF8(&re, str_ssee, -1, &status);
2721 utext_openUTF8(&dataText, str_blah, -1, &status);
2722 utext_openUTF8(&replText, str_ooh, -1, &status);
2723
2724 RegexMatcher m(&re, 0, status);
2725 REGEX_CHECK_STATUS;
2726
2727 UnicodeString result;
2728 UText resultText = UTEXT_INITIALIZER;
2729 utext_openUnicodeString(&resultText, &result, &status);
2730
2731 // Multiple finds do NOT bump up the previous appendReplacement position.
2732 m.reset(&dataText);
2733 m.find();
2734 m.find();
2735 m.appendReplacement(&resultText, &replText, status);
2736 REGEX_CHECK_STATUS;
2737 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2738 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2739
2740 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2741 status = U_ZERO_ERROR;
2742 result.truncate(0);
2743 utext_openUnicodeString(&resultText, &result, &status);
2744 m.reset(10, status);
2745 m.find();
2746 m.find();
2747 m.appendReplacement(&resultText, &replText, status);
2748 REGEX_CHECK_STATUS;
2749 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2750 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2751
2752 // find() at interior of string, appendReplacement still starts at beginning.
2753 status = U_ZERO_ERROR;
2754 result.truncate(0);
2755 utext_openUnicodeString(&resultText, &result, &status);
2756 m.reset();
2757 m.find(10, status);
2758 m.find();
2759 m.appendReplacement(&resultText, &replText, status);
2760 REGEX_CHECK_STATUS;
2761 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2762 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2763
2764 m.appendTail(&resultText, status);
2765 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2766 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2767
2768 utext_close(&resultText);
2769 }
2770
2771 delete matcher2;
2772 delete pat2;
2773 delete matcher;
2774 delete pat;
2775
2776 utext_close(&dataText);
2777 utext_close(&replText);
2778 utext_close(&destText);
2779 utext_close(&re);
2780 }
2781
2782
2783 //---------------------------------------------------------------------------
2784 //
2785 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2786 // present and nominally working.
2787 //
2788 //---------------------------------------------------------------------------
API_Pattern_UTF8()2789 void RegexTest::API_Pattern_UTF8() {
2790 RegexPattern pata; // Test default constructor to not crash.
2791 RegexPattern patb;
2792
2793 REGEX_ASSERT(pata == patb);
2794 REGEX_ASSERT(pata == pata);
2795
2796 UText re1 = UTEXT_INITIALIZER;
2797 UText re2 = UTEXT_INITIALIZER;
2798 UErrorCode status = U_ZERO_ERROR;
2799 UParseError pe;
2800
2801 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2802 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2803 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2804 utext_openUTF8(&re2, str_def, -1, &status);
2805
2806 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2807 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2808 REGEX_CHECK_STATUS;
2809 REGEX_ASSERT(*pat1 == *pat1);
2810 REGEX_ASSERT(*pat1 != pata);
2811
2812 // Assign
2813 patb = *pat1;
2814 REGEX_ASSERT(patb == *pat1);
2815
2816 // Copy Construct
2817 RegexPattern patc(*pat1);
2818 REGEX_ASSERT(patc == *pat1);
2819 REGEX_ASSERT(patb == patc);
2820 REGEX_ASSERT(pat1 != pat2);
2821 patb = *pat2;
2822 REGEX_ASSERT(patb != patc);
2823 REGEX_ASSERT(patb == *pat2);
2824
2825 // Compile with no flags.
2826 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2827 REGEX_ASSERT(*pat1a == *pat1);
2828
2829 REGEX_ASSERT(pat1a->flags() == 0);
2830
2831 // Compile with different flags should be not equal
2832 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2833 REGEX_CHECK_STATUS;
2834
2835 REGEX_ASSERT(*pat1b != *pat1a);
2836 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2837 REGEX_ASSERT(pat1a->flags() == 0);
2838 delete pat1b;
2839
2840 // clone
2841 RegexPattern *pat1c = pat1->clone();
2842 REGEX_ASSERT(*pat1c == *pat1);
2843 REGEX_ASSERT(*pat1c != *pat2);
2844
2845 delete pat1c;
2846 delete pat1a;
2847 delete pat1;
2848 delete pat2;
2849
2850 utext_close(&re1);
2851 utext_close(&re2);
2852
2853
2854 //
2855 // Verify that a matcher created from a cloned pattern works.
2856 // (Jitterbug 3423)
2857 //
2858 {
2859 UErrorCode status = U_ZERO_ERROR;
2860 UText pattern = UTEXT_INITIALIZER;
2861 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2862 utext_openUTF8(&pattern, str_pL, -1, &status);
2863
2864 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2865 RegexPattern *pClone = pSource->clone();
2866 delete pSource;
2867 RegexMatcher *mFromClone = pClone->matcher(status);
2868 REGEX_CHECK_STATUS;
2869
2870 UText input = UTEXT_INITIALIZER;
2871 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2872 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2873 mFromClone->reset(&input);
2874 REGEX_ASSERT(mFromClone->find() == TRUE);
2875 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2876 REGEX_ASSERT(mFromClone->find() == TRUE);
2877 REGEX_ASSERT(mFromClone->group(status) == "World");
2878 REGEX_ASSERT(mFromClone->find() == FALSE);
2879 delete mFromClone;
2880 delete pClone;
2881
2882 utext_close(&input);
2883 utext_close(&pattern);
2884 }
2885
2886 //
2887 // matches convenience API
2888 //
2889 {
2890 UErrorCode status = U_ZERO_ERROR;
2891 UText pattern = UTEXT_INITIALIZER;
2892 UText input = UTEXT_INITIALIZER;
2893
2894 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2895 utext_openUTF8(&input, str_randominput, -1, &status);
2896
2897 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2898 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2899 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2900 REGEX_CHECK_STATUS;
2901
2902 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2903 utext_openUTF8(&pattern, str_abc, -1, &status);
2904 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2905 REGEX_CHECK_STATUS;
2906
2907 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2908 utext_openUTF8(&pattern, str_nput, -1, &status);
2909 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2910 REGEX_CHECK_STATUS;
2911
2912 utext_openUTF8(&pattern, str_randominput, -1, &status);
2913 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2914 REGEX_CHECK_STATUS;
2915
2916 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2917 utext_openUTF8(&pattern, str_u, -1, &status);
2918 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2919 REGEX_CHECK_STATUS;
2920
2921 utext_openUTF8(&input, str_abc, -1, &status);
2922 utext_openUTF8(&pattern, str_abc, -1, &status);
2923 status = U_INDEX_OUTOFBOUNDS_ERROR;
2924 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2925 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2926
2927 utext_close(&input);
2928 utext_close(&pattern);
2929 }
2930
2931
2932 //
2933 // Split()
2934 //
2935 status = U_ZERO_ERROR;
2936 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2937 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2938 pat1 = RegexPattern::compile(&re1, pe, status);
2939 REGEX_CHECK_STATUS;
2940 UnicodeString fields[10];
2941
2942 int32_t n;
2943 n = pat1->split("Now is the time", fields, 10, status);
2944 REGEX_CHECK_STATUS;
2945 REGEX_ASSERT(n==4);
2946 REGEX_ASSERT(fields[0]=="Now");
2947 REGEX_ASSERT(fields[1]=="is");
2948 REGEX_ASSERT(fields[2]=="the");
2949 REGEX_ASSERT(fields[3]=="time");
2950 REGEX_ASSERT(fields[4]=="");
2951
2952 n = pat1->split("Now is the time", fields, 2, status);
2953 REGEX_CHECK_STATUS;
2954 REGEX_ASSERT(n==2);
2955 REGEX_ASSERT(fields[0]=="Now");
2956 REGEX_ASSERT(fields[1]=="is the time");
2957 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2958
2959 fields[1] = "*";
2960 status = U_ZERO_ERROR;
2961 n = pat1->split("Now is the time", fields, 1, status);
2962 REGEX_CHECK_STATUS;
2963 REGEX_ASSERT(n==1);
2964 REGEX_ASSERT(fields[0]=="Now is the time");
2965 REGEX_ASSERT(fields[1]=="*");
2966 status = U_ZERO_ERROR;
2967
2968 n = pat1->split(" Now is the time ", fields, 10, status);
2969 REGEX_CHECK_STATUS;
2970 REGEX_ASSERT(n==6);
2971 REGEX_ASSERT(fields[0]=="");
2972 REGEX_ASSERT(fields[1]=="Now");
2973 REGEX_ASSERT(fields[2]=="is");
2974 REGEX_ASSERT(fields[3]=="the");
2975 REGEX_ASSERT(fields[4]=="time");
2976 REGEX_ASSERT(fields[5]=="");
2977 REGEX_ASSERT(fields[6]=="");
2978
2979 fields[2] = "*";
2980 n = pat1->split(" ", fields, 10, status);
2981 REGEX_CHECK_STATUS;
2982 REGEX_ASSERT(n==2);
2983 REGEX_ASSERT(fields[0]=="");
2984 REGEX_ASSERT(fields[1]=="");
2985 REGEX_ASSERT(fields[2]=="*");
2986
2987 fields[0] = "foo";
2988 n = pat1->split("", fields, 10, status);
2989 REGEX_CHECK_STATUS;
2990 REGEX_ASSERT(n==0);
2991 REGEX_ASSERT(fields[0]=="foo");
2992
2993 delete pat1;
2994
2995 // split, with a pattern with (capture)
2996 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2997 pat1 = RegexPattern::compile(&re1, pe, status);
2998 REGEX_CHECK_STATUS;
2999
3000 status = U_ZERO_ERROR;
3001 fields[6] = fields[7] = "*";
3002 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3003 REGEX_CHECK_STATUS;
3004 REGEX_ASSERT(n==7);
3005 REGEX_ASSERT(fields[0]=="");
3006 REGEX_ASSERT(fields[1]=="a");
3007 REGEX_ASSERT(fields[2]=="Now is ");
3008 REGEX_ASSERT(fields[3]=="b");
3009 REGEX_ASSERT(fields[4]=="the time");
3010 REGEX_ASSERT(fields[5]=="c");
3011 REGEX_ASSERT(fields[6]=="");
3012 REGEX_ASSERT(fields[7]=="*");
3013 REGEX_ASSERT(status==U_ZERO_ERROR);
3014
3015 fields[6] = fields[7] = "*";
3016 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3017 REGEX_CHECK_STATUS;
3018 REGEX_ASSERT(n==7);
3019 REGEX_ASSERT(fields[0]==" ");
3020 REGEX_ASSERT(fields[1]=="a");
3021 REGEX_ASSERT(fields[2]=="Now is ");
3022 REGEX_ASSERT(fields[3]=="b");
3023 REGEX_ASSERT(fields[4]=="the time");
3024 REGEX_ASSERT(fields[5]=="c");
3025 REGEX_ASSERT(fields[6]=="");
3026 REGEX_ASSERT(fields[7]=="*");
3027
3028 status = U_ZERO_ERROR;
3029 fields[6] = "foo";
3030 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3031 REGEX_CHECK_STATUS;
3032 REGEX_ASSERT(n==6);
3033 REGEX_ASSERT(fields[0]==" ");
3034 REGEX_ASSERT(fields[1]=="a");
3035 REGEX_ASSERT(fields[2]=="Now is ");
3036 REGEX_ASSERT(fields[3]=="b");
3037 REGEX_ASSERT(fields[4]=="the time");
3038 REGEX_ASSERT(fields[5]==" ");
3039 REGEX_ASSERT(fields[6]=="foo");
3040
3041 status = U_ZERO_ERROR;
3042 fields[5] = "foo";
3043 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3044 REGEX_CHECK_STATUS;
3045 REGEX_ASSERT(n==5);
3046 REGEX_ASSERT(fields[0]==" ");
3047 REGEX_ASSERT(fields[1]=="a");
3048 REGEX_ASSERT(fields[2]=="Now is ");
3049 REGEX_ASSERT(fields[3]=="b");
3050 REGEX_ASSERT(fields[4]=="the time<c>");
3051 REGEX_ASSERT(fields[5]=="foo");
3052
3053 status = U_ZERO_ERROR;
3054 fields[5] = "foo";
3055 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3056 REGEX_CHECK_STATUS;
3057 REGEX_ASSERT(n==5);
3058 REGEX_ASSERT(fields[0]==" ");
3059 REGEX_ASSERT(fields[1]=="a");
3060 REGEX_ASSERT(fields[2]=="Now is ");
3061 REGEX_ASSERT(fields[3]=="b");
3062 REGEX_ASSERT(fields[4]=="the time");
3063 REGEX_ASSERT(fields[5]=="foo");
3064
3065 status = U_ZERO_ERROR;
3066 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3067 REGEX_CHECK_STATUS;
3068 REGEX_ASSERT(n==4);
3069 REGEX_ASSERT(fields[0]==" ");
3070 REGEX_ASSERT(fields[1]=="a");
3071 REGEX_ASSERT(fields[2]=="Now is ");
3072 REGEX_ASSERT(fields[3]=="the time<c>");
3073 status = U_ZERO_ERROR;
3074 delete pat1;
3075
3076 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3077 pat1 = RegexPattern::compile(&re1, pe, status);
3078 REGEX_CHECK_STATUS;
3079 n = pat1->split("1-10,20", fields, 10, status);
3080 REGEX_CHECK_STATUS;
3081 REGEX_ASSERT(n==5);
3082 REGEX_ASSERT(fields[0]=="1");
3083 REGEX_ASSERT(fields[1]=="-");
3084 REGEX_ASSERT(fields[2]=="10");
3085 REGEX_ASSERT(fields[3]==",");
3086 REGEX_ASSERT(fields[4]=="20");
3087 delete pat1;
3088
3089
3090 //
3091 // split of a UText based string, with library allocating output UTexts.
3092 //
3093 {
3094 status = U_ZERO_ERROR;
3095 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3096 UnicodeString stringToSplit("first:second:third");
3097 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3098 REGEX_CHECK_STATUS;
3099
3100 UText *splits[10] = {NULL};
3101 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3102 REGEX_CHECK_STATUS;
3103 REGEX_ASSERT(numFields == 5);
3104 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3105 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3106 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3107 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3108 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3109 REGEX_ASSERT(splits[5] == NULL);
3110
3111 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3112 if (splits[i]) {
3113 utext_close(splits[i]);
3114 splits[i] = NULL;
3115 }
3116 }
3117 utext_close(textToSplit);
3118 }
3119
3120
3121 //
3122 // RegexPattern::pattern() and patternText()
3123 //
3124 pat1 = new RegexPattern();
3125 REGEX_ASSERT(pat1->pattern() == "");
3126 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3127 delete pat1;
3128 const char *helloWorldInvariant = "(Hello, world)*";
3129 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3130 pat1 = RegexPattern::compile(&re1, pe, status);
3131 REGEX_CHECK_STATUS;
3132 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3133 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3134 delete pat1;
3135
3136 utext_close(&re1);
3137 }
3138
3139
3140 //---------------------------------------------------------------------------
3141 //
3142 // Extended A more thorough check for features of regex patterns
3143 // The test cases are in a separate data file,
//                 source/test/testdata/regextst.txt
3145 // A description of the test data format is included in that file.
3146 //
3147 //---------------------------------------------------------------------------
3148
3149 const char *
getPath(char buffer[2048],const char * filename)3150 RegexTest::getPath(char buffer[2048], const char *filename) {
3151 UErrorCode status=U_ZERO_ERROR;
3152 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3153 if (U_FAILURE(status)) {
3154 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3155 return NULL;
3156 }
3157
3158 strcpy(buffer, testDataDirectory);
3159 strcat(buffer, filename);
3160 return buffer;
3161 }
3162
Extended()3163 void RegexTest::Extended() {
3164 char tdd[2048];
3165 const char *srcPath;
3166 UErrorCode status = U_ZERO_ERROR;
3167 int32_t lineNum = 0;
3168
3169 //
3170 // Open and read the test data file.
3171 //
3172 srcPath=getPath(tdd, "regextst.txt");
3173 if(srcPath==NULL) {
3174 return; /* something went wrong, error already output */
3175 }
3176
3177 int32_t len;
3178 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3179 if (U_FAILURE(status)) {
3180 return; /* something went wrong, error already output */
3181 }
3182
3183 //
3184 // Put the test data into a UnicodeString
3185 //
3186 UnicodeString testString(FALSE, testData, len);
3187
3188 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3189 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3190 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3191
3192 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3193 UnicodeString testPattern; // The pattern for test from the test file.
3194 UnicodeString testFlags; // the flags for a test.
3195 UnicodeString matchString; // The marked up string to be used as input
3196
3197 if (U_FAILURE(status)){
3198 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3199 delete [] testData;
3200 return;
3201 }
3202
3203 //
3204 // Loop over the test data file, once per line.
3205 //
3206 while (lineMat.find()) {
3207 lineNum++;
3208 if (U_FAILURE(status)) {
3209 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3210 }
3211
3212 status = U_ZERO_ERROR;
3213 UnicodeString testLine = lineMat.group(1, status);
3214 if (testLine.length() == 0) {
3215 continue;
3216 }
3217
3218 //
3219 // Parse the test line. Skip blank and comment only lines.
3220 // Separate out the three main fields - pattern, flags, target.
3221 //
3222
3223 commentMat.reset(testLine);
3224 if (commentMat.lookingAt(status)) {
3225 // This line is a comment, or blank.
3226 continue;
3227 }
3228
3229 //
3230 // Pull out the pattern field, remove it from the test file line.
3231 //
3232 quotedStuffMat.reset(testLine);
3233 if (quotedStuffMat.lookingAt(status)) {
3234 testPattern = quotedStuffMat.group(2, status);
3235 testLine.remove(0, quotedStuffMat.end(0, status));
3236 } else {
3237 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3238 continue;
3239 }
3240
3241
3242 //
3243 // Pull out the flags from the test file line.
3244 //
3245 flagsMat.reset(testLine);
3246 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3247 testFlags = flagsMat.group(1, status);
3248 if (flagsMat.group(2, status).length() > 0) {
3249 errln("Bad Match flag at line %d. Scanning %c\n",
3250 lineNum, flagsMat.group(2, status).charAt(0));
3251 continue;
3252 }
3253 testLine.remove(0, flagsMat.end(0, status));
3254
3255 //
3256 // Pull out the match string, as a whole.
3257 // We'll process the <tags> later.
3258 //
3259 quotedStuffMat.reset(testLine);
3260 if (quotedStuffMat.lookingAt(status)) {
3261 matchString = quotedStuffMat.group(2, status);
3262 testLine.remove(0, quotedStuffMat.end(0, status));
3263 } else {
3264 errln("Bad match string at test file line %d", lineNum);
3265 continue;
3266 }
3267
3268 //
3269 // The only thing left from the input line should be an optional trailing comment.
3270 //
3271 commentMat.reset(testLine);
3272 if (commentMat.lookingAt(status) == FALSE) {
3273 errln("Line %d: unexpected characters at end of test line.", lineNum);
3274 continue;
3275 }
3276
3277 //
3278 // Run the test
3279 //
3280 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3281 }
3282
3283 delete [] testData;
3284
3285 }
3286
3287
3288
3289 //---------------------------------------------------------------------------
3290 //
3291 // regex_find(pattern, flags, inputString, lineNumber)
3292 //
3293 // Function to run a single test from the Extended (data driven) tests.
3294 // See file test/testdata/regextst.txt for a description of the
3295 // pattern and inputString fields, and the allowed flags.
3296 // lineNumber is the source line in regextst.txt of the test.
3297 //
3298 //---------------------------------------------------------------------------
3299
3300
3301 // Set a value into a UVector at position specified by a decimal number in
3302 // a UnicodeString. This is a utility function needed by the actual test function,
3303 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3304 static void set(UVector &vec, int32_t val, UnicodeString index) {
3305 UErrorCode status=U_ZERO_ERROR;
3306 int32_t idx = 0;
3307 for (int32_t i=0; i<index.length(); i++) {
3308 int32_t d=u_charDigitValue(index.charAt(i));
3309 if (d<0) {return;}
3310 idx = idx*10 + d;
3311 }
3312 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3313 vec.setElementAt(val, idx);
3314 }
3315
setInt(UVector & vec,int32_t val,int32_t idx)3316 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3317 UErrorCode status=U_ZERO_ERROR;
3318 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3319 vec.setElementAt(val, idx);
3320 }
3321
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3322 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3323 {
3324 UBool couldFind = TRUE;
3325 UTEXT_SETNATIVEINDEX(utext, 0);
3326 int32_t i = 0;
3327 while (i < unistrOffset) {
3328 UChar32 c = UTEXT_NEXT32(utext);
3329 if (c != U_SENTINEL) {
3330 i += U16_LENGTH(c);
3331 } else {
3332 couldFind = FALSE;
3333 break;
3334 }
3335 }
3336 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3337 return couldFind;
3338 }
3339
3340
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3341 void RegexTest::regex_find(const UnicodeString &pattern,
3342 const UnicodeString &flags,
3343 const UnicodeString &inputString,
3344 const char *srcPath,
3345 int32_t line) {
3346 UnicodeString unEscapedInput;
3347 UnicodeString deTaggedInput;
3348
3349 int32_t patternUTF8Length, inputUTF8Length;
3350 char *patternChars = NULL, *inputChars = NULL;
3351 UText patternText = UTEXT_INITIALIZER;
3352 UText inputText = UTEXT_INITIALIZER;
3353 UConverter *UTF8Converter = NULL;
3354
3355 UErrorCode status = U_ZERO_ERROR;
3356 UParseError pe;
3357 RegexPattern *parsePat = NULL;
3358 RegexMatcher *parseMatcher = NULL;
3359 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3360 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3361 UVector groupStarts(status);
3362 UVector groupEnds(status);
3363 UVector groupStartsUTF8(status);
3364 UVector groupEndsUTF8(status);
3365 UBool isMatch = FALSE, isUTF8Match = FALSE;
3366 UBool failed = FALSE;
3367 int32_t numFinds;
3368 int32_t i;
3369 UBool useMatchesFunc = FALSE;
3370 UBool useLookingAtFunc = FALSE;
3371 int32_t regionStart = -1;
3372 int32_t regionEnd = -1;
3373 int32_t regionStartUTF8 = -1;
3374 int32_t regionEndUTF8 = -1;
3375
3376
3377 //
3378 // Compile the caller's pattern
3379 //
3380 uint32_t bflags = 0;
3381 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3382 bflags |= UREGEX_CASE_INSENSITIVE;
3383 }
3384 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3385 bflags |= UREGEX_COMMENTS;
3386 }
3387 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3388 bflags |= UREGEX_DOTALL;
3389 }
3390 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3391 bflags |= UREGEX_MULTILINE;
3392 }
3393
3394 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3395 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3396 }
3397 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3398 bflags |= UREGEX_UNIX_LINES;
3399 }
3400 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3401 bflags |= UREGEX_LITERAL;
3402 }
3403
3404
3405 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3406 if (status != U_ZERO_ERROR) {
3407 #if UCONFIG_NO_BREAK_ITERATION==1
3408 // 'v' test flag means that the test pattern should not compile if ICU was configured
3409 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3410 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3411 goto cleanupAndReturn;
3412 }
3413 #endif
3414 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3415 // Expected pattern compilation error.
3416 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3417 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3418 }
3419 goto cleanupAndReturn;
3420 } else {
3421 // Unexpected pattern compilation error.
3422 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3423 goto cleanupAndReturn;
3424 }
3425 }
3426
3427 UTF8Converter = ucnv_open("UTF8", &status);
3428 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3429
3430 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3431 status = U_ZERO_ERROR; // buffer overflow
3432 patternChars = new char[patternUTF8Length+1];
3433 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3434 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3435
3436 if (status == U_ZERO_ERROR) {
3437 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3438
3439 if (status != U_ZERO_ERROR) {
3440 #if UCONFIG_NO_BREAK_ITERATION==1
3441 // 'v' test flag means that the test pattern should not compile if ICU was configured
3442 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3443 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3444 goto cleanupAndReturn;
3445 }
3446 #endif
3447 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3448 // Expected pattern compilation error.
3449 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3450 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3451 }
3452 goto cleanupAndReturn;
3453 } else {
3454 // Unexpected pattern compilation error.
3455 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3456 goto cleanupAndReturn;
3457 }
3458 }
3459 }
3460
3461 if (UTF8Pattern == NULL) {
3462 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3463 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3464 status = U_ZERO_ERROR;
3465 }
3466
3467 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3468 callerPattern->dumpPattern();
3469 }
3470
3471 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3472 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3473 goto cleanupAndReturn;
3474 }
3475
3476
3477 //
3478 // Number of times find() should be called on the test string, default to 1
3479 //
3480 numFinds = 1;
3481 for (i=2; i<=9; i++) {
3482 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3483 if (numFinds != 1) {
3484 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3485 goto cleanupAndReturn;
3486 }
3487 numFinds = i;
3488 }
3489 }
3490
3491 // 'M' flag. Use matches() instead of find()
3492 if (flags.indexOf((UChar)0x4d) >= 0) {
3493 useMatchesFunc = TRUE;
3494 }
3495 if (flags.indexOf((UChar)0x4c) >= 0) {
3496 useLookingAtFunc = TRUE;
3497 }
3498
3499 //
3500 // Find the tags in the input data, remove them, and record the group boundary
3501 // positions.
3502 //
3503 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3504 if (!assertSuccess(WHERE, status) ) {
3505 goto cleanupAndReturn;
3506 }
3507
3508 unEscapedInput = inputString.unescape();
3509 parseMatcher = parsePat->matcher(unEscapedInput, status);
3510 if (!assertSuccess(WHERE, status) ) {
3511 goto cleanupAndReturn;
3512 }
3513 while(parseMatcher->find()) {
3514 parseMatcher->appendReplacement(deTaggedInput, "", status);
3515 REGEX_CHECK_STATUS;
3516 UnicodeString groupNum = parseMatcher->group(2, status);
3517 if (groupNum == "r") {
3518 // <r> or </r>, a region specification within the string
3519 if (parseMatcher->group(1, status) == "/") {
3520 regionEnd = deTaggedInput.length();
3521 } else {
3522 regionStart = deTaggedInput.length();
3523 }
3524 } else {
3525 // <digits> or </digits>, a group match boundary tag.
3526 if (parseMatcher->group(1, status) == "/") {
3527 set(groupEnds, deTaggedInput.length(), groupNum);
3528 } else {
3529 set(groupStarts, deTaggedInput.length(), groupNum);
3530 }
3531 }
3532 }
3533 parseMatcher->appendTail(deTaggedInput);
3534
3535 if (groupStarts.size() != groupEnds.size()) {
3536 errln("Error at line %d: mismatched <n> group tags in expected results.", line);
3537 failed = true;
3538 goto cleanupAndReturn;
3539 }
3540 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3541 errln("mismatched <r> tags");
3542 failed = TRUE;
3543 goto cleanupAndReturn;
3544 }
3545
3546 //
3547 // Configure the matcher according to the flags specified with this test.
3548 //
3549 matcher = callerPattern->matcher(deTaggedInput, status);
3550 REGEX_CHECK_STATUS_L(line);
3551 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3552 matcher->setTrace(TRUE);
3553 }
3554
3555 if (UTF8Pattern != NULL) {
3556 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3557 status = U_ZERO_ERROR; // buffer overflow
3558 inputChars = new char[inputUTF8Length+1];
3559 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3560 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3561
3562 if (status == U_ZERO_ERROR) {
3563 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3564 REGEX_CHECK_STATUS_L(line);
3565 }
3566
3567 if (UTF8Matcher == NULL) {
3568 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3569 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3570 status = U_ZERO_ERROR;
3571 }
3572 }
3573
3574 //
3575 // Generate native indices for UTF8 versions of region and capture group info
3576 //
3577 if (UTF8Matcher != NULL) {
3578 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3579 UTF8Matcher->setTrace(TRUE);
3580 }
3581 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3582 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3583
3584 // Fill out the native index UVector info.
3585 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3586 for (i=0; i<groupStarts.size(); i++) {
3587 int32_t start = groupStarts.elementAti(i);
3588 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3589 if (start >= 0) {
3590 int32_t startUTF8;
3591 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3592 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3593 failed = TRUE;
3594 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3595 }
3596 setInt(groupStartsUTF8, startUTF8, i);
3597 }
3598
3599 int32_t end = groupEnds.elementAti(i);
3600 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3601 if (end >= 0) {
3602 int32_t endUTF8;
3603 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3604 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3605 failed = TRUE;
3606 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3607 }
3608 setInt(groupEndsUTF8, endUTF8, i);
3609 }
3610 }
3611 }
3612
3613 if (regionStart>=0) {
3614 matcher->region(regionStart, regionEnd, status);
3615 REGEX_CHECK_STATUS_L(line);
3616 if (UTF8Matcher != NULL) {
3617 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3618 REGEX_CHECK_STATUS_L(line);
3619 }
3620 }
3621 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3622 matcher->useAnchoringBounds(FALSE);
3623 if (UTF8Matcher != NULL) {
3624 UTF8Matcher->useAnchoringBounds(FALSE);
3625 }
3626 }
3627 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3628 matcher->useTransparentBounds(TRUE);
3629 if (UTF8Matcher != NULL) {
3630 UTF8Matcher->useTransparentBounds(TRUE);
3631 }
3632 }
3633
3634
3635
3636 //
3637 // Do a find on the de-tagged input using the caller's pattern
3638 // TODO: error on count>1 and not find().
3639 // error on both matches() and lookingAt().
3640 //
3641 for (i=0; i<numFinds; i++) {
3642 if (useMatchesFunc) {
3643 isMatch = matcher->matches(status);
3644 if (UTF8Matcher != NULL) {
3645 isUTF8Match = UTF8Matcher->matches(status);
3646 }
3647 } else if (useLookingAtFunc) {
3648 isMatch = matcher->lookingAt(status);
3649 if (UTF8Matcher != NULL) {
3650 isUTF8Match = UTF8Matcher->lookingAt(status);
3651 }
3652 } else {
3653 isMatch = matcher->find();
3654 if (UTF8Matcher != NULL) {
3655 isUTF8Match = UTF8Matcher->find();
3656 }
3657 }
3658 }
3659 matcher->setTrace(FALSE);
3660 if (UTF8Matcher) {
3661 UTF8Matcher->setTrace(FALSE);
3662 }
3663 if (U_FAILURE(status)) {
3664 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3665 }
3666
3667 //
3668 // Match up the groups from the find() with the groups from the tags
3669 //
3670
3671 // number of tags should match number of groups from find operation.
3672 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3673 // G option in test means that capture group data is not available in the
3674 // expected results, so the check needs to be suppressed.
3675 if (isMatch == FALSE && groupStarts.size() != 0) {
3676 dataerrln("Error at line %d: Match expected, but none found.", line);
3677 failed = TRUE;
3678 goto cleanupAndReturn;
3679 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3680 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3681 failed = TRUE;
3682 goto cleanupAndReturn;
3683 }
3684 if (isMatch && groupStarts.size() == 0) {
3685 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3686 failed = TRUE;
3687 }
3688 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3689 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3690 failed = TRUE;
3691 }
3692
3693 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694 // Only check for match / no match. Don't check capture groups.
3695 goto cleanupAndReturn;
3696 }
3697
3698 REGEX_CHECK_STATUS_L(line);
3699 for (i=0; i<=matcher->groupCount(); i++) {
3700 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3701 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3702 if (matcher->start(i, status) != expectedStart) {
3703 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3704 line, i, expectedStart, matcher->start(i, status));
3705 failed = TRUE;
3706 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3707 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3708 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3709 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3710 failed = TRUE;
3711 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3712 }
3713
3714 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3715 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3716 if (matcher->end(i, status) != expectedEnd) {
3717 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3718 line, i, expectedEnd, matcher->end(i, status));
3719 failed = TRUE;
3720 // Error on end position; keep going; real error is probably yet to come as group
3721 // end positions work from end of the input data towards the front.
3722 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3723 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3724 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3725 failed = TRUE;
3726 // Error on end position; keep going; real error is probably yet to come as group
3727 // end positions work from end of the input data towards the front.
3728 }
3729 }
3730 if ( matcher->groupCount()+1 < groupStarts.size()) {
3731 errln("Error at line %d: Expected %d capture groups, found %d.",
3732 line, groupStarts.size()-1, matcher->groupCount());
3733 failed = TRUE;
3734 }
3735 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3736 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3737 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3738 failed = TRUE;
3739 }
3740
3741 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3742 matcher->requireEnd() == TRUE) {
3743 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3744 failed = TRUE;
3745 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3746 UTF8Matcher->requireEnd() == TRUE) {
3747 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3748 failed = TRUE;
3749 }
3750
3751 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3752 matcher->requireEnd() == FALSE) {
3753 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3754 failed = TRUE;
3755 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3756 UTF8Matcher->requireEnd() == FALSE) {
3757 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3758 failed = TRUE;
3759 }
3760
3761 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3762 matcher->hitEnd() == TRUE) {
3763 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3764 failed = TRUE;
3765 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3766 UTF8Matcher->hitEnd() == TRUE) {
3767 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3768 failed = TRUE;
3769 }
3770
3771 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3772 matcher->hitEnd() == FALSE) {
3773 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3774 failed = TRUE;
3775 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3776 UTF8Matcher->hitEnd() == FALSE) {
3777 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3778 failed = TRUE;
3779 }
3780
3781
3782 cleanupAndReturn:
3783 if (failed) {
3784 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3785 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3786 // callerPattern->dump();
3787 }
3788 delete parseMatcher;
3789 delete parsePat;
3790 delete UTF8Matcher;
3791 delete UTF8Pattern;
3792 delete matcher;
3793 delete callerPattern;
3794
3795 utext_close(&inputText);
3796 delete[] inputChars;
3797 utext_close(&patternText);
3798 delete[] patternChars;
3799 ucnv_close(UTF8Converter);
3800 }
3801
3802
3803
3804
3805 //---------------------------------------------------------------------------
3806 //
3807 // Errors Check for error handling in patterns.
3808 //
3809 //---------------------------------------------------------------------------
Errors()3810 void RegexTest::Errors() {
3811 // \escape sequences that aren't implemented yet.
3812 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3813
3814 // Missing close parentheses
3815 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3816 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3817 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3818
3819 // Extra close paren
3820 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3821 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3822 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3823
3824 // Look-ahead, Look-behind
3825 // TODO: add tests for unbounded length look-behinds.
3826 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3827
3828 // Attempt to use non-default flags
3829 {
3830 UParseError pe;
3831 UErrorCode status = U_ZERO_ERROR;
3832 int32_t flags = UREGEX_CANON_EQ |
3833 UREGEX_COMMENTS | UREGEX_DOTALL |
3834 UREGEX_MULTILINE;
3835 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3836 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3837 delete pat1;
3838 }
3839
3840
3841 // Quantifiers are allowed only after something that can be quantified.
3842 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3843 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3844 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3845
3846 // Mal-formed {min,max} quantifiers
3847 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3848 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3849 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3850 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3851 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3852 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3853 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3854 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3855 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3856
3857 // Ticket 5389
3858 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3859
3860 // Invalid Back Reference \0
3861 // For ICU 3.8 and earlier
3862 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3863 //
3864 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3865
3866 }
3867
3868 //-------------------------------------------------------------------------------
3869 //
3870 // PerlTests - Run Perl's regular expression tests
3871 // The input file for this test is re_tests, the standard regular
3872 // expression test data distributed with the Perl source code.
3873 //
3874 // Here is Perl's description of the test data file:
3875 //
3876 // # The tests are in a separate file 't/op/re_tests'.
3877 // # Each line in that file is a separate test.
3878 // # There are five columns, separated by tabs.
3879 // #
3880 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3881 // # Modifiers can be put after the closing C<'>.
3882 // #
3883 // # Column 2 contains the string to be matched.
3884 // #
3885 // # Column 3 contains the expected result:
3886 // # y expect a match
3887 // # n expect no match
3888 // # c expect an error
3889 // # B test exposes a known bug in Perl, should be skipped
3890 // # b test exposes a known bug in Perl, should be skipped if noamp
3891 // #
3892 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3893 // #
3894 // # Column 4 contains a string, usually C<$&>.
3895 // #
3896 // # Column 5 contains the expected result of double-quote
3897 // # interpolating that string after the match, or start of error message.
3898 // #
3899 // # Column 6, if present, contains a reason why the test is skipped.
3900 // # This is printed with "skipped", for harness to pick up.
3901 // #
3902 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3903 // #
3904 // # If you want to add a regular expression test that can't be expressed
3905 // # in this format, don't add it here: put it in op/pat.t instead.
3906 //
3907 // For ICU, if field 3 contains an 'i', the test will be skipped.
// The test exposes some known incompatibility between ICU and Perl regexps.
3909 // (The i is in addition to whatever was there before.)
3910 //
3911 //-------------------------------------------------------------------------------
PerlTests()3912 void RegexTest::PerlTests() {
3913 char tdd[2048];
3914 const char *srcPath;
3915 UErrorCode status = U_ZERO_ERROR;
3916 UParseError pe;
3917
3918 //
3919 // Open and read the test data file.
3920 //
3921 srcPath=getPath(tdd, "re_tests.txt");
3922 if(srcPath==NULL) {
3923 return; /* something went wrong, error already output */
3924 }
3925
3926 int32_t len;
3927 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3928 if (U_FAILURE(status)) {
3929 return; /* something went wrong, error already output */
3930 }
3931
3932 //
3933 // Put the test data into a UnicodeString
3934 //
3935 UnicodeString testDataString(FALSE, testData, len);
3936
3937 //
3938 // Regex to break the input file into lines, and strip the new lines.
3939 // One line per match, capture group one is the desired data.
3940 //
3941 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3942 if (U_FAILURE(status)) {
3943 dataerrln("RegexPattern::compile() error");
3944 return;
3945 }
3946 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3947
3948 //
3949 // Regex to split a test file line into fields.
3950 // There are six fields, separated by tabs.
3951 //
3952 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3953
3954 //
3955 // Regex to identify test patterns with flag settings, and to separate them.
3956 // Test patterns with flags look like 'pattern'i
3957 // Test patterns without flags are not quoted: pattern
3958 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3959 //
3960 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3961 RegexMatcher* flagMat = flagPat->matcher(status);
3962
3963 //
3964 // The Perl tests reference several perl-isms, which are evaluated/substituted
3965 // in the test data. Not being perl, this must be done explicitly. Here
3966 // are string constants and REs for these constructs.
3967 //
3968 UnicodeString nulnulSrc("${nulnul}");
3969 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3970 nulnul = nulnul.unescape();
3971
3972 UnicodeString ffffSrc("${ffff}");
3973 UnicodeString ffff("\\uffff", -1, US_INV);
3974 ffff = ffff.unescape();
3975
3976 // regexp for $-[0], $+[2], etc.
3977 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3978 RegexMatcher *groupsMat = groupsPat->matcher(status);
3979
3980 // regexp for $0, $1, $2, etc.
3981 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3982 RegexMatcher *cgMat = cgPat->matcher(status);
3983
3984
3985 //
3986 // Main Loop for the Perl Tests, runs once per line from the
3987 // test data file.
3988 //
3989 int32_t lineNum = 0;
3990 int32_t skippedUnimplementedCount = 0;
3991 while (lineMat->find()) {
3992 lineNum++;
3993
3994 //
3995 // Get a line, break it into its fields, do the Perl
3996 // variable substitutions.
3997 //
3998 UnicodeString line = lineMat->group(1, status);
3999 UnicodeString fields[7];
4000 fieldPat->split(line, fields, 7, status);
4001
4002 flagMat->reset(fields[0]);
4003 flagMat->matches(status);
4004 UnicodeString pattern = flagMat->group(2, status);
4005 pattern.findAndReplace("${bang}", "!");
4006 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4007 pattern.findAndReplace(ffffSrc, ffff);
4008
4009 //
4010 // Identify patterns that include match flag settings,
4011 // split off the flags, remove the extra quotes.
4012 //
4013 UnicodeString flagStr = flagMat->group(3, status);
4014 if (U_FAILURE(status)) {
4015 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4016 return;
4017 }
4018 int32_t flags = 0;
4019 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4020 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4021 const UChar UChar_m = 0x6d;
4022 const UChar UChar_x = 0x78;
4023 const UChar UChar_y = 0x79;
4024 if (flagStr.indexOf(UChar_i) != -1) {
4025 flags |= UREGEX_CASE_INSENSITIVE;
4026 }
4027 if (flagStr.indexOf(UChar_m) != -1) {
4028 flags |= UREGEX_MULTILINE;
4029 }
4030 if (flagStr.indexOf(UChar_x) != -1) {
4031 flags |= UREGEX_COMMENTS;
4032 }
4033
4034 //
4035 // Compile the test pattern.
4036 //
4037 status = U_ZERO_ERROR;
4038 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4039 if (status == U_REGEX_UNIMPLEMENTED) {
4040 //
4041 // Test of a feature that is planned for ICU, but not yet implemented.
4042 // skip the test.
4043 skippedUnimplementedCount++;
4044 delete testPat;
4045 status = U_ZERO_ERROR;
4046 continue;
4047 }
4048
4049 if (U_FAILURE(status)) {
4050 // Some tests are supposed to generate errors.
4051 // Only report an error for tests that are supposed to succeed.
4052 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4053 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4054 {
4055 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4056 }
4057 status = U_ZERO_ERROR;
4058 delete testPat;
4059 continue;
4060 }
4061
4062 if (fields[2].indexOf(UChar_i) >= 0) {
4063 // ICU should skip this test.
4064 delete testPat;
4065 continue;
4066 }
4067
4068 if (fields[2].indexOf(UChar_c) >= 0) {
4069 // This pattern should have caused a compilation error, but didn't/
4070 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4071 delete testPat;
4072 continue;
4073 }
4074
4075 //
4076 // replace the Perl variables that appear in some of the
4077 // match data strings.
4078 //
4079 UnicodeString matchString = fields[1];
4080 matchString.findAndReplace(nulnulSrc, nulnul);
4081 matchString.findAndReplace(ffffSrc, ffff);
4082
4083 // Replace any \n in the match string with an actual new-line char.
4084 // Don't do full unescape, as this unescapes more than Perl does, which
4085 // causes other spurious failures in the tests.
4086 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4087
4088
4089
4090 //
4091 // Run the test, check for expected match/don't match result.
4092 //
4093 RegexMatcher *testMat = testPat->matcher(matchString, status);
4094 UBool found = testMat->find();
4095 UBool expected = FALSE;
4096 if (fields[2].indexOf(UChar_y) >=0) {
4097 expected = TRUE;
4098 }
4099 if (expected != found) {
4100 errln("line %d: Expected %smatch, got %smatch",
4101 lineNum, expected?"":"no ", found?"":"no " );
4102 delete testMat;
4103 delete testPat;
4104 continue;
4105 }
4106
4107 // Don't try to check expected results if there is no match.
4108 // (Some have stuff in the expected fields)
4109 if (!found) {
4110 delete testMat;
4111 delete testPat;
4112 continue;
4113 }
4114
4115 //
4116 // Interpret the Perl expression from the fourth field of the data file,
4117 // building up an ICU string from the results of the ICU match.
4118 // The Perl expression will contain references to the results of
4119 // a regex match, including the matched string, capture group strings,
4120 // group starting and ending indices, etc.
4121 //
4122 UnicodeString resultString;
4123 UnicodeString perlExpr = fields[3];
4124 #if SUPPORT_MUTATING_INPUT_STRING
4125 groupsMat->reset(perlExpr);
4126 cgMat->reset(perlExpr);
4127 #endif
4128
4129 while (perlExpr.length() > 0) {
4130 #if !SUPPORT_MUTATING_INPUT_STRING
4131 // Preferred usage. Reset after any modification to input string.
4132 groupsMat->reset(perlExpr);
4133 cgMat->reset(perlExpr);
4134 #endif
4135
4136 if (perlExpr.startsWith("$&")) {
4137 resultString.append(testMat->group(status));
4138 perlExpr.remove(0, 2);
4139 }
4140
4141 else if (groupsMat->lookingAt(status)) {
4142 // $-[0] $+[2] etc.
4143 UnicodeString digitString = groupsMat->group(2, status);
4144 int32_t t = 0;
4145 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4146 UnicodeString plusOrMinus = groupsMat->group(1, status);
4147 int32_t matchPosition;
4148 if (plusOrMinus.compare("+") == 0) {
4149 matchPosition = testMat->end(groupNum, status);
4150 } else {
4151 matchPosition = testMat->start(groupNum, status);
4152 }
4153 if (matchPosition != -1) {
4154 ICU_Utility::appendNumber(resultString, matchPosition);
4155 }
4156 perlExpr.remove(0, groupsMat->end(status));
4157 }
4158
4159 else if (cgMat->lookingAt(status)) {
4160 // $1, $2, $3, etc.
4161 UnicodeString digitString = cgMat->group(1, status);
4162 int32_t t = 0;
4163 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4164 if (U_SUCCESS(status)) {
4165 resultString.append(testMat->group(groupNum, status));
4166 status = U_ZERO_ERROR;
4167 }
4168 perlExpr.remove(0, cgMat->end(status));
4169 }
4170
4171 else if (perlExpr.startsWith("@-")) {
4172 int32_t i;
4173 for (i=0; i<=testMat->groupCount(); i++) {
4174 if (i>0) {
4175 resultString.append(" ");
4176 }
4177 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4178 }
4179 perlExpr.remove(0, 2);
4180 }
4181
4182 else if (perlExpr.startsWith("@+")) {
4183 int32_t i;
4184 for (i=0; i<=testMat->groupCount(); i++) {
4185 if (i>0) {
4186 resultString.append(" ");
4187 }
4188 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4189 }
4190 perlExpr.remove(0, 2);
4191 }
4192
4193 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4194 // or as an escaped sequence (e.g. \n)
4195 if (perlExpr.length() > 1) {
4196 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4197 }
4198 UChar c = perlExpr.charAt(0);
4199 switch (c) {
4200 case 'n': c = '\n'; break;
4201 // add any other escape sequences that show up in the test expected results.
4202 }
4203 resultString.append(c);
4204 perlExpr.remove(0, 1);
4205 }
4206
4207 else {
4208 // Any characters from the perl expression that we don't explicitly
4209 // recognize before here are assumed to be literals and copied
4210 // as-is to the expected results.
4211 resultString.append(perlExpr.charAt(0));
4212 perlExpr.remove(0, 1);
4213 }
4214
4215 if (U_FAILURE(status)) {
4216 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4217 break;
4218 }
4219 }
4220
4221 //
4222 // Expected Results Compare
4223 //
4224 UnicodeString expectedS(fields[4]);
4225 expectedS.findAndReplace(nulnulSrc, nulnul);
4226 expectedS.findAndReplace(ffffSrc, ffff);
4227 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4228
4229
4230 if (expectedS.compare(resultString) != 0) {
4231 err("Line %d: Incorrect perl expression results.", lineNum);
4232 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4233 }
4234
4235 delete testMat;
4236 delete testPat;
4237 }
4238
4239 //
4240 // All done. Clean up allocated stuff.
4241 //
4242 delete cgMat;
4243 delete cgPat;
4244
4245 delete groupsMat;
4246 delete groupsPat;
4247
4248 delete flagMat;
4249 delete flagPat;
4250
4251 delete lineMat;
4252 delete linePat;
4253
4254 delete fieldPat;
4255 delete [] testData;
4256
4257
4258 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4259
4260 }
4261
4262
4263 //-------------------------------------------------------------------------------
4264 //
4265 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4266 // (instead of using UnicodeStrings) to test the alternate engine.
4267 // The input file for this test is re_tests, the standard regular
4268 // expression test data distributed with the Perl source code.
4269 // See PerlTests() for more information.
4270 //
4271 //-------------------------------------------------------------------------------
PerlTestsUTF8()4272 void RegexTest::PerlTestsUTF8() {
4273 char tdd[2048];
4274 const char *srcPath;
4275 UErrorCode status = U_ZERO_ERROR;
4276 UParseError pe;
4277 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4278 UText patternText = UTEXT_INITIALIZER;
4279 char *patternChars = NULL;
4280 int32_t patternLength;
4281 int32_t patternCapacity = 0;
4282 UText inputText = UTEXT_INITIALIZER;
4283 char *inputChars = NULL;
4284 int32_t inputLength;
4285 int32_t inputCapacity = 0;
4286
4287 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4288
4289 //
4290 // Open and read the test data file.
4291 //
4292 srcPath=getPath(tdd, "re_tests.txt");
4293 if(srcPath==NULL) {
4294 return; /* something went wrong, error already output */
4295 }
4296
4297 int32_t len;
4298 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4299 if (U_FAILURE(status)) {
4300 return; /* something went wrong, error already output */
4301 }
4302
4303 //
4304 // Put the test data into a UnicodeString
4305 //
4306 UnicodeString testDataString(FALSE, testData, len);
4307
4308 //
4309 // Regex to break the input file into lines, and strip the new lines.
4310 // One line per match, capture group one is the desired data.
4311 //
4312 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4313 if (U_FAILURE(status)) {
4314 dataerrln("RegexPattern::compile() error");
4315 return;
4316 }
4317 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4318
4319 //
4320 // Regex to split a test file line into fields.
4321 // There are six fields, separated by tabs.
4322 //
4323 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4324
4325 //
4326 // Regex to identify test patterns with flag settings, and to separate them.
4327 // Test patterns with flags look like 'pattern'i
4328 // Test patterns without flags are not quoted: pattern
4329 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4330 //
4331 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4332 RegexMatcher* flagMat = flagPat->matcher(status);
4333
4334 //
4335 // The Perl tests reference several perl-isms, which are evaluated/substituted
4336 // in the test data. Not being perl, this must be done explicitly. Here
4337 // are string constants and REs for these constructs.
4338 //
4339 UnicodeString nulnulSrc("${nulnul}");
4340 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4341 nulnul = nulnul.unescape();
4342
4343 UnicodeString ffffSrc("${ffff}");
4344 UnicodeString ffff("\\uffff", -1, US_INV);
4345 ffff = ffff.unescape();
4346
4347 // regexp for $-[0], $+[2], etc.
4348 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4349 RegexMatcher *groupsMat = groupsPat->matcher(status);
4350
4351 // regexp for $0, $1, $2, etc.
4352 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4353 RegexMatcher *cgMat = cgPat->matcher(status);
4354
4355
4356 //
4357 // Main Loop for the Perl Tests, runs once per line from the
4358 // test data file.
4359 //
4360 int32_t lineNum = 0;
4361 int32_t skippedUnimplementedCount = 0;
4362 while (lineMat->find()) {
4363 lineNum++;
4364
4365 //
4366 // Get a line, break it into its fields, do the Perl
4367 // variable substitutions.
4368 //
4369 UnicodeString line = lineMat->group(1, status);
4370 UnicodeString fields[7];
4371 fieldPat->split(line, fields, 7, status);
4372
4373 flagMat->reset(fields[0]);
4374 flagMat->matches(status);
4375 UnicodeString pattern = flagMat->group(2, status);
4376 pattern.findAndReplace("${bang}", "!");
4377 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4378 pattern.findAndReplace(ffffSrc, ffff);
4379
4380 //
4381 // Identify patterns that include match flag settings,
4382 // split off the flags, remove the extra quotes.
4383 //
4384 UnicodeString flagStr = flagMat->group(3, status);
4385 if (U_FAILURE(status)) {
4386 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4387 return;
4388 }
4389 int32_t flags = 0;
4390 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4391 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4392 const UChar UChar_m = 0x6d;
4393 const UChar UChar_x = 0x78;
4394 const UChar UChar_y = 0x79;
4395 if (flagStr.indexOf(UChar_i) != -1) {
4396 flags |= UREGEX_CASE_INSENSITIVE;
4397 }
4398 if (flagStr.indexOf(UChar_m) != -1) {
4399 flags |= UREGEX_MULTILINE;
4400 }
4401 if (flagStr.indexOf(UChar_x) != -1) {
4402 flags |= UREGEX_COMMENTS;
4403 }
4404
4405 //
4406 // Put the pattern in a UTF-8 UText
4407 //
4408 status = U_ZERO_ERROR;
4409 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4410 if (status == U_BUFFER_OVERFLOW_ERROR) {
4411 status = U_ZERO_ERROR;
4412 delete[] patternChars;
4413 patternCapacity = patternLength + 1;
4414 patternChars = new char[patternCapacity];
4415 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4416 }
4417 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4418
4419 //
4420 // Compile the test pattern.
4421 //
4422 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4423 if (status == U_REGEX_UNIMPLEMENTED) {
4424 //
4425 // Test of a feature that is planned for ICU, but not yet implemented.
4426 // skip the test.
4427 skippedUnimplementedCount++;
4428 delete testPat;
4429 status = U_ZERO_ERROR;
4430 continue;
4431 }
4432
4433 if (U_FAILURE(status)) {
4434 // Some tests are supposed to generate errors.
4435 // Only report an error for tests that are supposed to succeed.
4436 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4437 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4438 {
4439 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4440 }
4441 status = U_ZERO_ERROR;
4442 delete testPat;
4443 continue;
4444 }
4445
4446 if (fields[2].indexOf(UChar_i) >= 0) {
4447 // ICU should skip this test.
4448 delete testPat;
4449 continue;
4450 }
4451
4452 if (fields[2].indexOf(UChar_c) >= 0) {
4453 // This pattern should have caused a compilation error, but didn't/
4454 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4455 delete testPat;
4456 continue;
4457 }
4458
4459
4460 //
4461 // replace the Perl variables that appear in some of the
4462 // match data strings.
4463 //
4464 UnicodeString matchString = fields[1];
4465 matchString.findAndReplace(nulnulSrc, nulnul);
4466 matchString.findAndReplace(ffffSrc, ffff);
4467
4468 // Replace any \n in the match string with an actual new-line char.
4469 // Don't do full unescape, as this unescapes more than Perl does, which
4470 // causes other spurious failures in the tests.
4471 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4472
4473 //
4474 // Put the input in a UTF-8 UText
4475 //
4476 status = U_ZERO_ERROR;
4477 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4478 if (status == U_BUFFER_OVERFLOW_ERROR) {
4479 status = U_ZERO_ERROR;
4480 delete[] inputChars;
4481 inputCapacity = inputLength + 1;
4482 inputChars = new char[inputCapacity];
4483 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4484 }
4485 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4486
4487 //
4488 // Run the test, check for expected match/don't match result.
4489 //
4490 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4491 UBool found = testMat->find();
4492 UBool expected = FALSE;
4493 if (fields[2].indexOf(UChar_y) >=0) {
4494 expected = TRUE;
4495 }
4496 if (expected != found) {
4497 errln("line %d: Expected %smatch, got %smatch",
4498 lineNum, expected?"":"no ", found?"":"no " );
4499 delete testMat;
4500 delete testPat;
4501 continue;
4502 }
4503
4504 // Don't try to check expected results if there is no match.
4505 // (Some have stuff in the expected fields)
4506 if (!found) {
4507 delete testMat;
4508 delete testPat;
4509 continue;
4510 }
4511
4512 //
4513 // Interpret the Perl expression from the fourth field of the data file,
4514 // building up an ICU string from the results of the ICU match.
4515 // The Perl expression will contain references to the results of
4516 // a regex match, including the matched string, capture group strings,
4517 // group starting and ending indices, etc.
4518 //
4519 UnicodeString resultString;
4520 UnicodeString perlExpr = fields[3];
4521
4522 while (perlExpr.length() > 0) {
4523 groupsMat->reset(perlExpr);
4524 cgMat->reset(perlExpr);
4525
4526 if (perlExpr.startsWith("$&")) {
4527 resultString.append(testMat->group(status));
4528 perlExpr.remove(0, 2);
4529 }
4530
4531 else if (groupsMat->lookingAt(status)) {
4532 // $-[0] $+[2] etc.
4533 UnicodeString digitString = groupsMat->group(2, status);
4534 int32_t t = 0;
4535 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4536 UnicodeString plusOrMinus = groupsMat->group(1, status);
4537 int32_t matchPosition;
4538 if (plusOrMinus.compare("+") == 0) {
4539 matchPosition = testMat->end(groupNum, status);
4540 } else {
4541 matchPosition = testMat->start(groupNum, status);
4542 }
4543 if (matchPosition != -1) {
4544 ICU_Utility::appendNumber(resultString, matchPosition);
4545 }
4546 perlExpr.remove(0, groupsMat->end(status));
4547 }
4548
4549 else if (cgMat->lookingAt(status)) {
4550 // $1, $2, $3, etc.
4551 UnicodeString digitString = cgMat->group(1, status);
4552 int32_t t = 0;
4553 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4554 if (U_SUCCESS(status)) {
4555 resultString.append(testMat->group(groupNum, status));
4556 status = U_ZERO_ERROR;
4557 }
4558 perlExpr.remove(0, cgMat->end(status));
4559 }
4560
4561 else if (perlExpr.startsWith("@-")) {
4562 int32_t i;
4563 for (i=0; i<=testMat->groupCount(); i++) {
4564 if (i>0) {
4565 resultString.append(" ");
4566 }
4567 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4568 }
4569 perlExpr.remove(0, 2);
4570 }
4571
4572 else if (perlExpr.startsWith("@+")) {
4573 int32_t i;
4574 for (i=0; i<=testMat->groupCount(); i++) {
4575 if (i>0) {
4576 resultString.append(" ");
4577 }
4578 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4579 }
4580 perlExpr.remove(0, 2);
4581 }
4582
4583 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4584 // or as an escaped sequence (e.g. \n)
4585 if (perlExpr.length() > 1) {
4586 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4587 }
4588 UChar c = perlExpr.charAt(0);
4589 switch (c) {
4590 case 'n': c = '\n'; break;
4591 // add any other escape sequences that show up in the test expected results.
4592 }
4593 resultString.append(c);
4594 perlExpr.remove(0, 1);
4595 }
4596
4597 else {
4598 // Any characters from the perl expression that we don't explicitly
4599 // recognize before here are assumed to be literals and copied
4600 // as-is to the expected results.
4601 resultString.append(perlExpr.charAt(0));
4602 perlExpr.remove(0, 1);
4603 }
4604
4605 if (U_FAILURE(status)) {
4606 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4607 break;
4608 }
4609 }
4610
4611 //
4612 // Expected Results Compare
4613 //
4614 UnicodeString expectedS(fields[4]);
4615 expectedS.findAndReplace(nulnulSrc, nulnul);
4616 expectedS.findAndReplace(ffffSrc, ffff);
4617 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4618
4619
4620 if (expectedS.compare(resultString) != 0) {
4621 err("Line %d: Incorrect perl expression results.", lineNum);
4622 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4623 }
4624
4625 delete testMat;
4626 delete testPat;
4627 }
4628
4629 //
4630 // All done. Clean up allocated stuff.
4631 //
4632 delete cgMat;
4633 delete cgPat;
4634
4635 delete groupsMat;
4636 delete groupsPat;
4637
4638 delete flagMat;
4639 delete flagPat;
4640
4641 delete lineMat;
4642 delete linePat;
4643
4644 delete fieldPat;
4645 delete [] testData;
4646
4647 utext_close(&patternText);
4648 utext_close(&inputText);
4649
4650 delete [] patternChars;
4651 delete [] inputChars;
4652
4653
4654 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4655
4656 }
4657
4658
4659 //--------------------------------------------------------------
4660 //
4661 // Bug6149 Verify limits to heap expansion for backtrack stack.
4662 // Use this pattern,
4663 // "(a?){1,8000000}"
4664 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4665 // This test is likely to be fragile, as further optimizations stop
4666 // more cases of pointless looping in the match engine.
4667 //
4668 //---------------------------------------------------------------
Bug6149()4669 void RegexTest::Bug6149() {
4670 UnicodeString pattern("(a?){1,8000000}");
4671 UnicodeString s("xyz");
4672 uint32_t flags = 0;
4673 UErrorCode status = U_ZERO_ERROR;
4674
4675 RegexMatcher matcher(pattern, s, flags, status);
4676 UBool result = false;
4677 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4678 REGEX_ASSERT(result == FALSE);
4679 }
4680
4681
4682 //
4683 // Callbacks() Test the callback function.
4684 // When set, callbacks occur periodically during matching operations,
4685 // giving the application code the ability to abort the operation
//                 before its normal completion.
4687 //
4688
4689 struct callBackContext {
4690 RegexTest *test;
4691 int32_t maxCalls;
4692 int32_t numCalls;
4693 int32_t lastSteps;
resetcallBackContext4694 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
4695 };
4696
4697 U_CDECL_BEGIN
4698 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4699 testCallBackFn(const void *context, int32_t steps) {
4700 callBackContext *info = (callBackContext *)context;
4701 if (info->lastSteps+1 != steps) {
4702 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4703 }
4704 info->lastSteps = steps;
4705 info->numCalls++;
4706 return (info->numCalls < info->maxCalls);
4707 }
4708 U_CDECL_END
4709
Callbacks()4710 void RegexTest::Callbacks() {
4711 {
4712 // Getter returns NULLs if no callback has been set
4713
4714 // The variables that the getter will fill in.
4715 // Init to non-null values so that the action of the getter can be seen.
4716 const void *returnedContext = &returnedContext;
4717 URegexMatchCallback *returnedFn = &testCallBackFn;
4718
4719 UErrorCode status = U_ZERO_ERROR;
4720 RegexMatcher matcher("x", 0, status);
4721 REGEX_CHECK_STATUS;
4722 matcher.getMatchCallback(returnedFn, returnedContext, status);
4723 REGEX_CHECK_STATUS;
4724 REGEX_ASSERT(returnedFn == NULL);
4725 REGEX_ASSERT(returnedContext == NULL);
4726 }
4727
4728 {
4729 // Set and Get work
4730 callBackContext cbInfo = {this, 0, 0, 0};
4731 const void *returnedContext;
4732 URegexMatchCallback *returnedFn;
4733 UErrorCode status = U_ZERO_ERROR;
4734 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4735 REGEX_CHECK_STATUS;
4736 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4737 REGEX_CHECK_STATUS;
4738 matcher.getMatchCallback(returnedFn, returnedContext, status);
4739 REGEX_CHECK_STATUS;
4740 REGEX_ASSERT(returnedFn == testCallBackFn);
4741 REGEX_ASSERT(returnedContext == &cbInfo);
4742
4743 // A short-running match shouldn't invoke the callback
4744 status = U_ZERO_ERROR;
4745 cbInfo.reset(1);
4746 UnicodeString s = "xxx";
4747 matcher.reset(s);
4748 REGEX_ASSERT(matcher.matches(status));
4749 REGEX_CHECK_STATUS;
4750 REGEX_ASSERT(cbInfo.numCalls == 0);
4751
4752 // A medium-length match that runs long enough to invoke the
4753 // callback, but not so long that the callback aborts it.
4754 status = U_ZERO_ERROR;
4755 cbInfo.reset(4);
4756 s = "aaaaaaaaaaaaaaaaaaab";
4757 matcher.reset(s);
4758 REGEX_ASSERT(matcher.matches(status)==FALSE);
4759 REGEX_CHECK_STATUS;
4760 REGEX_ASSERT(cbInfo.numCalls > 0);
4761
4762 // A longer running match that the callback function will abort.
4763 status = U_ZERO_ERROR;
4764 cbInfo.reset(4);
4765 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4766 matcher.reset(s);
4767 REGEX_ASSERT(matcher.matches(status)==FALSE);
4768 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4769 REGEX_ASSERT(cbInfo.numCalls == 4);
4770
4771 // A longer running find that the callback function will abort.
4772 status = U_ZERO_ERROR;
4773 cbInfo.reset(4);
4774 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4775 matcher.reset(s);
4776 REGEX_ASSERT(matcher.find(status)==FALSE);
4777 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4778 REGEX_ASSERT(cbInfo.numCalls == 4);
4779 }
4780
4781
4782 }
4783
4784
4785 //
4786 // FindProgressCallbacks() Test the find "progress" callback function.
//                 When set, the find progress callback will be invoked during a find operation
//                 after each return from a match attempt, giving the application the opportunity
//                 to terminate a long-running find operation before its normal completion.
4790 //
4791
4792 struct progressCallBackContext {
4793 RegexTest *test;
4794 int64_t lastIndex;
4795 int32_t maxCalls;
4796 int32_t numCalls;
resetprogressCallBackContext4797 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
4798 };
4799
4800 // call-back function for find().
4801 // Return TRUE to continue the find().
4802 // Return FALSE to stop the find().
4803 U_CDECL_BEGIN
4804 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4805 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4806 progressCallBackContext *info = (progressCallBackContext *)context;
4807 info->numCalls++;
4808 info->lastIndex = matchIndex;
4809 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4810 return (info->numCalls < info->maxCalls);
4811 }
4812 U_CDECL_END
4813
FindProgressCallbacks()4814 void RegexTest::FindProgressCallbacks() {
4815 {
4816 // Getter returns NULLs if no callback has been set
4817
4818 // The variables that the getter will fill in.
4819 // Init to non-null values so that the action of the getter can be seen.
4820 const void *returnedContext = &returnedContext;
4821 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4822
4823 UErrorCode status = U_ZERO_ERROR;
4824 RegexMatcher matcher("x", 0, status);
4825 REGEX_CHECK_STATUS;
4826 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4827 REGEX_CHECK_STATUS;
4828 REGEX_ASSERT(returnedFn == NULL);
4829 REGEX_ASSERT(returnedContext == NULL);
4830 }
4831
4832 {
4833 // Set and Get work
4834 progressCallBackContext cbInfo = {this, 0, 0, 0};
4835 const void *returnedContext;
4836 URegexFindProgressCallback *returnedFn;
4837 UErrorCode status = U_ZERO_ERROR;
4838 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4839 REGEX_CHECK_STATUS;
4840 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4841 REGEX_CHECK_STATUS;
4842 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4843 REGEX_CHECK_STATUS;
4844 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4845 REGEX_ASSERT(returnedContext == &cbInfo);
4846
4847 // A find that matches on the initial position does NOT invoke the callback.
4848 status = U_ZERO_ERROR;
4849 cbInfo.reset(100);
4850 UnicodeString s = "aaxxx";
4851 matcher.reset(s);
4852 #if 0
4853 matcher.setTrace(TRUE);
4854 #endif
4855 REGEX_ASSERT(matcher.find(0, status));
4856 REGEX_CHECK_STATUS;
4857 REGEX_ASSERT(cbInfo.numCalls == 0);
4858
4859 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4860 // but not so many times that we interrupt the operation.
4861 status = U_ZERO_ERROR;
4862 s = "aaaaaaaaaaaaaaaaaaab";
4863 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4864 matcher.reset(s);
4865 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4866 REGEX_CHECK_STATUS;
4867 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4868
4869 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4870 status = U_ZERO_ERROR;
4871 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4872 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4873 matcher.reset(s1);
4874 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4875 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4876 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4877
4878 // Now a match that will succeed, but after an interruption
4879 status = U_ZERO_ERROR;
4880 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4881 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4882 matcher.reset(s2);
4883 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4884 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4885 // Now retry the match from where left off
4886 cbInfo.maxCalls = 100; // No callback limit
4887 status = U_ZERO_ERROR;
4888 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4889 REGEX_CHECK_STATUS;
4890 }
4891
4892
4893 }
4894
4895
4896 //---------------------------------------------------------------------------
4897 //
4898 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4899 // UTexts. The pure-C implementation of UText
4900 // has no mutable backing stores, but we can
4901 // use UnicodeString here to test the functionality.
4902 //
4903 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4904 void RegexTest::PreAllocatedUTextCAPI () {
4905 UErrorCode status = U_ZERO_ERROR;
4906 URegularExpression *re;
4907 UText patternText = UTEXT_INITIALIZER;
4908 UnicodeString buffer;
4909 UText bufferText = UTEXT_INITIALIZER;
4910
4911 utext_openUnicodeString(&bufferText, &buffer, &status);
4912
4913 /*
4914 * getText() and getUText()
4915 */
4916 {
4917 UText text1 = UTEXT_INITIALIZER;
4918 UText text2 = UTEXT_INITIALIZER;
4919 UChar text2Chars[20];
4920 UText *resultText;
4921
4922 status = U_ZERO_ERROR;
4923 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4924 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4925 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4926 utext_openUChars(&text2, text2Chars, -1, &status);
4927
4928 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4929 re = uregex_openUText(&patternText, 0, NULL, &status);
4930
4931 /* First set a UText */
4932 uregex_setUText(re, &text1, &status);
4933 resultText = uregex_getUText(re, &bufferText, &status);
4934 REGEX_CHECK_STATUS;
4935 REGEX_ASSERT(resultText == &bufferText);
4936 utext_setNativeIndex(resultText, 0);
4937 utext_setNativeIndex(&text1, 0);
4938 REGEX_ASSERT(testUTextEqual(resultText, &text1));
4939
4940 resultText = uregex_getUText(re, &bufferText, &status);
4941 REGEX_CHECK_STATUS;
4942 REGEX_ASSERT(resultText == &bufferText);
4943 utext_setNativeIndex(resultText, 0);
4944 utext_setNativeIndex(&text1, 0);
4945 REGEX_ASSERT(testUTextEqual(resultText, &text1));
4946
4947 /* Then set a UChar * */
4948 uregex_setText(re, text2Chars, 7, &status);
4949 resultText = uregex_getUText(re, &bufferText, &status);
4950 REGEX_CHECK_STATUS;
4951 REGEX_ASSERT(resultText == &bufferText);
4952 utext_setNativeIndex(resultText, 0);
4953 utext_setNativeIndex(&text2, 0);
4954 REGEX_ASSERT(testUTextEqual(resultText, &text2));
4955
4956 uregex_close(re);
4957 utext_close(&text1);
4958 utext_close(&text2);
4959 }
4960
4961 /*
4962 * group()
4963 */
4964 {
4965 UChar text1[80];
4966 UText *actual;
4967 UBool result;
4968 int64_t length = 0;
4969
4970 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
4971 // 012345678901234567890123456789012345678901234567
4972 // 0 1 2 3 4
4973
4974 status = U_ZERO_ERROR;
4975 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4976 REGEX_CHECK_STATUS;
4977
4978 uregex_setText(re, text1, -1, &status);
4979 result = uregex_find(re, 0, &status);
4980 REGEX_ASSERT(result==TRUE);
4981
4982 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
4983 status = U_ZERO_ERROR;
4984 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
4985 REGEX_CHECK_STATUS;
4986 REGEX_ASSERT(actual == &bufferText);
4987 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
4988 REGEX_ASSERT(length == 16);
4989 REGEX_ASSERT(utext_nativeLength(actual) == 47);
4990
4991 /* Capture group #1. Should succeed, matching " interior ". */
4992 status = U_ZERO_ERROR;
4993 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
4994 REGEX_CHECK_STATUS;
4995 REGEX_ASSERT(actual == &bufferText);
4996 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
4997 REGEX_ASSERT(length == 10);
4998 REGEX_ASSERT(utext_nativeLength(actual) == 47);
4999
5000 /* Capture group out of range. Error. */
5001 status = U_ZERO_ERROR;
5002 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5003 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5004 REGEX_ASSERT(actual == &bufferText);
5005 uregex_close(re);
5006
5007 }
5008
5009 /*
5010 * replaceFirst()
5011 */
5012 {
5013 UChar text1[80];
5014 UChar text2[80];
5015 UText replText = UTEXT_INITIALIZER;
5016 UText *result;
5017 status = U_ZERO_ERROR;
5018 utext_openUnicodeString(&bufferText, &buffer, &status);
5019
5020 status = U_ZERO_ERROR;
5021 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5022 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5023 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5024
5025 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5026 REGEX_CHECK_STATUS;
5027
5028 /* Normal case, with match */
5029 uregex_setText(re, text1, -1, &status);
5030 REGEX_CHECK_STATUS;
5031 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5032 REGEX_CHECK_STATUS;
5033 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5034 REGEX_CHECK_STATUS;
5035 REGEX_ASSERT(result == &bufferText);
5036 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5037
5038 /* No match. Text should copy to output with no changes. */
5039 uregex_setText(re, text2, -1, &status);
5040 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5041 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5042 REGEX_CHECK_STATUS;
5043 REGEX_ASSERT(result == &bufferText);
5044 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5045
5046 /* Unicode escapes */
5047 uregex_setText(re, text1, -1, &status);
5048 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5049 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5050 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5051 REGEX_CHECK_STATUS;
5052 REGEX_ASSERT(result == &bufferText);
5053 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5054
5055 uregex_close(re);
5056 utext_close(&replText);
5057 }
5058
5059
5060 /*
5061 * replaceAll()
5062 */
5063 {
5064 UChar text1[80];
5065 UChar text2[80];
5066 UText replText = UTEXT_INITIALIZER;
5067 UText *result;
5068
5069 status = U_ZERO_ERROR;
5070 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5071 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5072 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5073
5074 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5075 REGEX_CHECK_STATUS;
5076
5077 /* Normal case, with match */
5078 uregex_setText(re, text1, -1, &status);
5079 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5080 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5081 REGEX_CHECK_STATUS;
5082 REGEX_ASSERT(result == &bufferText);
5083 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5084
5085 /* No match. Text should copy to output with no changes. */
5086 uregex_setText(re, text2, -1, &status);
5087 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5088 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5089 REGEX_CHECK_STATUS;
5090 REGEX_ASSERT(result == &bufferText);
5091 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5092
5093 uregex_close(re);
5094 utext_close(&replText);
5095 }
5096
5097
5098 /*
5099 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5100 * so we don't need to test it here.
5101 */
5102
5103 utext_close(&bufferText);
5104 utext_close(&patternText);
5105 }
5106
5107
5108 //--------------------------------------------------------------
5109 //
5110 // NamedCapture Check basic named capture group functionality
5111 //
5112 //--------------------------------------------------------------
NamedCapture()5113 void RegexTest::NamedCapture() {
5114 UErrorCode status = U_ZERO_ERROR;
5115 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5116 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5117 REGEX_CHECK_STATUS;
5118 int32_t group = pat->groupNumberFromName("five", -1, status);
5119 REGEX_CHECK_STATUS;
5120 REGEX_ASSERT(5 == group);
5121 group = pat->groupNumberFromName("three", -1, status);
5122 REGEX_CHECK_STATUS;
5123 REGEX_ASSERT(3 == group);
5124
5125 status = U_ZERO_ERROR;
5126 group = pat->groupNumberFromName(UnicodeString("six"), status);
5127 REGEX_CHECK_STATUS;
5128 REGEX_ASSERT(6 == group);
5129
5130 status = U_ZERO_ERROR;
5131 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5132 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5133
5134 status = U_ZERO_ERROR;
5135
5136 // After copying a pattern, named capture should still work in the copy.
5137 RegexPattern *copiedPat = new RegexPattern(*pat);
5138 REGEX_ASSERT(*copiedPat == *pat);
5139 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5140
5141 group = copiedPat->groupNumberFromName("five", -1, status);
5142 REGEX_CHECK_STATUS;
5143 REGEX_ASSERT(5 == group);
5144 group = copiedPat->groupNumberFromName("three", -1, status);
5145 REGEX_CHECK_STATUS;
5146 REGEX_ASSERT(3 == group);
5147 delete copiedPat;
5148
5149 // ReplaceAll with named capture group.
5150 status = U_ZERO_ERROR;
5151 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5152 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5153 REGEX_CHECK_STATUS;
5154 // m.pattern().dumpPattern();
5155 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5156 REGEX_CHECK_STATUS;
5157 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5158 delete m;
5159
5160 // ReplaceAll, allowed capture group numbers.
5161 text = UnicodeString("abcmxyz");
5162 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5163 REGEX_CHECK_STATUS;
5164
5165 status = U_ZERO_ERROR;
5166 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5167 REGEX_CHECK_STATUS;
5168 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5169
5170 status = U_ZERO_ERROR;
5171 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5172 REGEX_CHECK_STATUS;
5173 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5174
5175 status = U_ZERO_ERROR;
5176 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5177 REGEX_CHECK_STATUS;
5178 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5179
5180 status = U_ZERO_ERROR;
5181 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5182 REGEX_CHECK_STATUS;
5183 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5184
5185 status = U_ZERO_ERROR;
5186 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5187 REGEX_CHECK_STATUS;
5188 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5189
5190 status = U_ZERO_ERROR;
5191 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5192 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5193
5194 status = U_ZERO_ERROR;
5195 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5196 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5197 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5198
5199 status = U_ZERO_ERROR;
5200 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5201 REGEX_CHECK_STATUS; // that push group num out of range.
5202 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5203
5204 status = U_ZERO_ERROR;
5205 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5206 REGEX_CHECK_STATUS;
5207 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5208
5209 status = U_ZERO_ERROR;
5210 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5211 REGEX_CHECK_STATUS;
5212 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5213
5214 status = U_ZERO_ERROR;
5215 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5216 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5217
5218 status = U_ZERO_ERROR;
5219 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5220 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5221
5222 status = U_ZERO_ERROR;
5223 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5224 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5225
5226 status = U_ZERO_ERROR;
5227 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5228 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5229
5230 delete m;
5231
5232 // Repeat the above replaceAll() tests using the plain C API, which
5233 // has a separate implementation internally.
5234 // TODO: factor out the test data.
5235
5236 status = U_ZERO_ERROR;
5237 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5238 REGEX_CHECK_STATUS;
5239 text = UnicodeString("abcmxyz");
5240 uregex_setText(re, text.getBuffer(), text.length(), &status);
5241 REGEX_CHECK_STATUS;
5242
5243 UChar resultBuf[100];
5244 int32_t resultLength;
5245 UnicodeString repl;
5246
5247 status = U_ZERO_ERROR;
5248 repl = UnicodeString("<$0>");
5249 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5250 REGEX_CHECK_STATUS;
5251 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5252
5253 status = U_ZERO_ERROR;
5254 repl = UnicodeString("<$1>");
5255 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5256 REGEX_CHECK_STATUS;
5257 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5258
5259 status = U_ZERO_ERROR;
5260 repl = UnicodeString("<${one}>");
5261 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5262 REGEX_CHECK_STATUS;
5263 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5264
5265 status = U_ZERO_ERROR;
5266 repl = UnicodeString("<$2>");
5267 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5268 REGEX_CHECK_STATUS;
5269 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5270
5271 status = U_ZERO_ERROR;
5272 repl = UnicodeString("<$3>");
5273 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5274 REGEX_CHECK_STATUS;
5275 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5276
5277 status = U_ZERO_ERROR;
5278 repl = UnicodeString("<$4>");
5279 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5280 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5281
5282 status = U_ZERO_ERROR;
5283 repl = UnicodeString("<$04>");
5284 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5285 REGEX_CHECK_STATUS;
5286 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5287
5288 status = U_ZERO_ERROR;
5289 repl = UnicodeString("<$000016>");
5290 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5291 REGEX_CHECK_STATUS;
5292 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5293
5294 status = U_ZERO_ERROR;
5295 repl = UnicodeString("<$3$2$1${one}>");
5296 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5297 REGEX_CHECK_STATUS;
5298 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5299
5300 status = U_ZERO_ERROR;
5301 repl = UnicodeString("$3$2$1${one}");
5302 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5303 REGEX_CHECK_STATUS;
5304 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5305
5306 status = U_ZERO_ERROR;
5307 repl = UnicodeString("<${noSuchName}>");
5308 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5309 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5310
5311 status = U_ZERO_ERROR;
5312 repl = UnicodeString("<${invalid-name}>");
5313 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5314 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5315
5316 status = U_ZERO_ERROR;
5317 repl = UnicodeString("<${one");
5318 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5319 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5320
5321 status = U_ZERO_ERROR;
5322 repl = UnicodeString("$not a capture group");
5323 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5324 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5325
5326 uregex_close(re);
5327 }
5328
5329 //--------------------------------------------------------------
5330 //
5331 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5332 // The point is not so much what the exact limit is,
5333 // but that a largish number doesn't hit bad non-linear performance,
5334 // and that exceeding the limit fails cleanly.
5335 //
5336 //--------------------------------------------------------------
NamedCaptureLimits()5337 void RegexTest::NamedCaptureLimits() {
5338 if (quick) {
5339 logln("Skipping test. Runs in exhuastive mode only.");
5340 return;
5341 }
5342 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5343 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5344 char nnbuf[100];
5345 UnicodeString pattern;
5346 int32_t nn;
5347
5348 for (nn=1; nn<goodLimit; nn++) {
5349 sprintf(nnbuf, "(?<nn%d>)", nn);
5350 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5351 }
5352 UErrorCode status = U_ZERO_ERROR;
5353 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5354 REGEX_CHECK_STATUS;
5355 for (nn=1; nn<goodLimit; nn++) {
5356 sprintf(nnbuf, "nn%d", nn);
5357 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5358 REGEX_ASSERT(nn == groupNum);
5359 if (nn != groupNum) {
5360 break;
5361 }
5362 }
5363 delete pat;
5364
5365 pattern.remove();
5366 for (nn=1; nn<failLimit; nn++) {
5367 sprintf(nnbuf, "(?<nn%d>)", nn);
5368 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5369 }
5370 status = U_ZERO_ERROR;
5371 pat = RegexPattern::compile(pattern, 0, status);
5372 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5373 delete pat;
5374 }
5375
5376
5377 //--------------------------------------------------------------
5378 //
5379 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5380 //
5381 //---------------------------------------------------------------
Bug7651()5382 void RegexTest::Bug7651() {
5383 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5384 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5385 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5386 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5387 UnicodeString s("#ff @abcd This is test");
5388 RegexPattern *REPattern = NULL;
5389 RegexMatcher *REMatcher = NULL;
5390 UErrorCode status = U_ZERO_ERROR;
5391 UParseError pe;
5392
5393 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5394 REGEX_CHECK_STATUS;
5395 REMatcher = REPattern->matcher(s, status);
5396 REGEX_CHECK_STATUS;
5397 REGEX_ASSERT(REMatcher->find());
5398 REGEX_ASSERT(REMatcher->start(status) == 0);
5399 delete REPattern;
5400 delete REMatcher;
5401 status = U_ZERO_ERROR;
5402
5403 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5404 REGEX_CHECK_STATUS;
5405 REMatcher = REPattern->matcher(s, status);
5406 REGEX_CHECK_STATUS;
5407 REGEX_ASSERT(REMatcher->find());
5408 REGEX_ASSERT(REMatcher->start(status) == 0);
5409 delete REPattern;
5410 delete REMatcher;
5411 status = U_ZERO_ERROR;
5412 }
5413
Bug7740()5414 void RegexTest::Bug7740() {
5415 UErrorCode status = U_ZERO_ERROR;
5416 UnicodeString pattern = "(a)";
5417 UnicodeString text = "abcdef";
5418 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5419 REGEX_CHECK_STATUS;
5420 REGEX_ASSERT(m->lookingAt(status));
5421 REGEX_CHECK_STATUS;
5422 status = U_ILLEGAL_ARGUMENT_ERROR;
5423 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5424 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5425 REGEX_ASSERT(s == "");
5426 delete m;
5427 }
5428
// Bug 8479:  was crashing with a bogus UnicodeString as input.
5430
Bug8479()5431 void RegexTest::Bug8479() {
5432 UErrorCode status = U_ZERO_ERROR;
5433
5434 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5435 REGEX_CHECK_STATUS;
5436 if (U_SUCCESS(status))
5437 {
5438 UnicodeString str;
5439 str.setToBogus();
5440 pMatcher->reset(str);
5441 status = U_ZERO_ERROR;
5442 pMatcher->matches(status);
5443 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5444 delete pMatcher;
5445 }
5446 }
5447
5448
5449 // Bug 7029
Bug7029()5450 void RegexTest::Bug7029() {
5451 UErrorCode status = U_ZERO_ERROR;
5452
5453 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5454 UnicodeString text = "abc.def";
5455 UnicodeString splits[10];
5456 REGEX_CHECK_STATUS;
5457 int32_t numFields = pMatcher->split(text, splits, 10, status);
5458 REGEX_CHECK_STATUS;
5459 REGEX_ASSERT(numFields == 8);
5460 delete pMatcher;
5461 }
5462
5463 // Bug 9283
5464 // This test is checking for the existence of any supplemental characters that case-fold
5465 // to a bmp character.
5466 //
5467 // At the time of this writing there are none. If any should appear in a subsequent release
5468 // of Unicode, the code in regular expressions compilation that determines the longest
5469 // possible match for a literal string will need to be enhanced.
5470 //
5471 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5472 // for details on what to do in case of a failure of this test.
5473 //
Bug9283()5474 void RegexTest::Bug9283() {
5475 #if !UCONFIG_NO_NORMALIZATION
5476 UErrorCode status = U_ZERO_ERROR;
5477 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5478 REGEX_CHECK_STATUS;
5479 int32_t index;
5480 UChar32 c;
5481 for (index=0; ; index++) {
5482 c = supplementalsWithCaseFolding.charAt(index);
5483 if (c == -1) {
5484 break;
5485 }
5486 UnicodeString cf = UnicodeString(c).foldCase();
5487 REGEX_ASSERT(cf.length() >= 2);
5488 }
5489 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5490 }
5491
5492
CheckInvBufSize()5493 void RegexTest::CheckInvBufSize() {
5494 if(inv_next>=INV_BUFSIZ) {
5495 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5496 __FILE__, INV_BUFSIZ, inv_next);
5497 } else {
5498 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5499 }
5500 }
5501
5502
Bug10459()5503 void RegexTest::Bug10459() {
5504 UErrorCode status = U_ZERO_ERROR;
5505 UnicodeString patternString("(txt)");
5506 UnicodeString txtString("txt");
5507
5508 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5509 REGEX_CHECK_STATUS;
5510 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5511 REGEX_CHECK_STATUS;
5512
5513 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5514 REGEX_CHECK_STATUS;
5515
5516 uregex_setUText(icu_re, utext_txt, &status);
5517 REGEX_CHECK_STATUS;
5518
5519 // The bug was that calling uregex_group() before doing a matching operation
5520 // was causing a segfault. Only for Regular Expressions created from UText.
5521 // It should set an U_REGEX_INVALID_STATE.
5522
5523 UChar buf[100];
5524 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5525 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5526 REGEX_ASSERT(len == 0);
5527
5528 uregex_close(icu_re);
5529 utext_close(utext_pat);
5530 utext_close(utext_txt);
5531 }
5532
TestCaseInsensitiveStarters()5533 void RegexTest::TestCaseInsensitiveStarters() {
5534 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5535 // become stale because of new Unicode characters.
5536 // If it is stale, rerun the generation tool
5537 // https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
5538 // and replace the embedded data in i18n/regexcmp.cpp
5539
5540 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5541 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5542 continue;
5543 }
5544 UnicodeSet s(cp, cp);
5545 s.closeOver(USET_CASE_INSENSITIVE);
5546 UnicodeSetIterator setIter(s);
5547 while (setIter.next()) {
5548 if (!setIter.isString()) {
5549 continue;
5550 }
5551 const UnicodeString &str = setIter.getString();
5552 UChar32 firstChar = str.char32At(0);
5553 UnicodeSet starters;
5554 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5555 if (!starters.contains(cp)) {
5556 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5557 return;
5558 }
5559 }
5560 }
5561 }
5562
5563
TestBug11049()5564 void RegexTest::TestBug11049() {
5565 // Original bug report: pattern with match start consisting of one of several individual characters,
5566 // and the text being matched ending with a supplementary character. find() would read past the
5567 // end of the input text when searching for potential match starting points.
5568
5569 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5570 // detect the bad read.
5571
5572 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5573 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5574
5575 // Test again with a pattern starting with a single character,
5576 // which takes a different code path than starting with an OR expression,
5577 // but with similar logic.
5578 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5579 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5580 }
5581
5582 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5583 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5584 UErrorCode status = U_ZERO_ERROR;
5585 UnicodeString patternString = UnicodeString(pattern).unescape();
5586 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5587
5588 UnicodeString dataString = UnicodeString(data).unescape();
5589 UChar *exactBuffer = new UChar[dataString.length()];
5590 dataString.extract(exactBuffer, dataString.length(), status);
5591 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5592
5593 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5594 REGEX_CHECK_STATUS;
5595 matcher->reset(ut);
5596 UBool result = matcher->find();
5597 if (result != expectMatch) {
5598 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5599 __FILE__, lineNumber, expectMatch, result, pattern, data);
5600 }
5601
5602 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5603 // off-by-one on find() with match at the last code point.
5604 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5605 // because string.unescape() will only shrink it.
5606 char * utf8Buffer = new char[uprv_strlen(data)+1];
5607 u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5608 REGEX_CHECK_STATUS;
5609 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5610 REGEX_CHECK_STATUS;
5611 matcher->reset(ut);
5612 result = matcher->find();
5613 if (result != expectMatch) {
5614 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5615 __FILE__, lineNumber, expectMatch, result, pattern, data);
5616 }
5617 delete [] utf8Buffer;
5618
5619 utext_close(ut);
5620 delete [] exactBuffer;
5621 }
5622
5623
TestBug11371()5624 void RegexTest::TestBug11371() {
5625 if (quick) {
5626 logln("Skipping test. Runs in exhuastive mode only.");
5627 return;
5628 }
5629 UErrorCode status = U_ZERO_ERROR;
5630 UnicodeString patternString;
5631
5632 for (int i=0; i<8000000; i++) {
5633 patternString.append(UnicodeString("()"));
5634 }
5635 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5636 if (status != U_REGEX_PATTERN_TOO_BIG) {
5637 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5638 __FILE__, __LINE__, u_errorName(status));
5639 }
5640
5641 status = U_ZERO_ERROR;
5642 patternString = "(";
5643 for (int i=0; i<20000000; i++) {
5644 patternString.append(UnicodeString("A++"));
5645 }
5646 patternString.append(UnicodeString("){0}B++"));
5647 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5648 if (status != U_REGEX_PATTERN_TOO_BIG) {
5649 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5650 __FILE__, __LINE__, u_errorName(status));
5651 }
5652
5653 // Pattern with too much string data, such that string indexes overflow operand data field size
5654 // in compiled instruction.
5655 status = U_ZERO_ERROR;
5656 patternString = "";
5657 while (patternString.length() < 0x00ffffff) {
5658 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5659 }
5660 patternString.append(UnicodeString("X? trailing string"));
5661 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5662 if (status != U_REGEX_PATTERN_TOO_BIG) {
5663 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5664 __FILE__, __LINE__, u_errorName(status));
5665 }
5666 }
5667
TestBug11480()5668 void RegexTest::TestBug11480() {
5669 // C API, get capture group of a group that does not participate in the match.
5670 // (Returns a zero length string, with nul termination,
5671 // indistinguishable from a group with a zero length match.)
5672
5673 UErrorCode status = U_ZERO_ERROR;
5674 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5675 REGEX_CHECK_STATUS;
5676 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5677 uregex_setText(re, text.getBuffer(), text.length(), &status);
5678 REGEX_CHECK_STATUS;
5679 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5680 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5681 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5682 REGEX_ASSERT(length == 0);
5683 REGEX_ASSERT(buf[0] == 13);
5684 REGEX_ASSERT(buf[1] == 0);
5685 REGEX_ASSERT(buf[2] == 13);
5686 uregex_close(re);
5687
5688 // UText C++ API, length of match is 0 for non-participating matches.
5689 UText ut = UTEXT_INITIALIZER;
5690 utext_openUnicodeString(&ut, &text, &status);
5691 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5692 REGEX_CHECK_STATUS;
5693 matcher.reset(&ut);
5694 REGEX_ASSERT(matcher.lookingAt(0, status));
5695
5696 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5697 int64_t groupLen = -666;
5698 UText group = UTEXT_INITIALIZER;
5699 matcher.group(1, &group, groupLen, status);
5700 REGEX_CHECK_STATUS;
5701 REGEX_ASSERT(groupLen == 1);
5702 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5703
5704 // Capture group 2, the (B), does not participate in the match.
5705 matcher.group(2, &group, groupLen, status);
5706 REGEX_CHECK_STATUS;
5707 REGEX_ASSERT(groupLen == 0);
5708 REGEX_ASSERT(matcher.start(2, status) == -1);
5709 REGEX_CHECK_STATUS;
5710 }
5711
TestBug12884()5712 void RegexTest::TestBug12884() {
5713 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5714 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5715 UnicodeString text(u"hello");
5716 UErrorCode status = U_ZERO_ERROR;
5717 RegexMatcher m(pattern, text, 0, status);
5718 REGEX_CHECK_STATUS;
5719 m.setTimeLimit(5, status);
5720 m.find(status);
5721 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5722
5723 // Non-greedy loops. They take a different code path during matching.
5724 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5725 status = U_ZERO_ERROR;
5726 RegexMatcher ngM(ngPattern, text, 0, status);
5727 REGEX_CHECK_STATUS;
5728 ngM.setTimeLimit(5, status);
5729 ngM.find(status);
5730 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5731
5732 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5733 StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5734 "carácter, sin importar la plataforma, sin importar el programa,"
5735 "sin importar el idioma.");
5736 status = U_ZERO_ERROR;
5737 LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
5738 REGEX_CHECK_STATUS;
5739 m.reset(ut.getAlias());
5740 m.find(status);
5741 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5742
5743 status = U_ZERO_ERROR;
5744 ngM.reset(ut.getAlias());
5745 ngM.find(status);
5746 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5747 }
5748
// Bug 13631. A find() of a pattern with a zero-length look-behind assertion
//            can cause a read past the end of the input text.
5751 // The failure is seen when running this test with Clang's Address Sanitizer.
5752
TestBug13631()5753 void RegexTest::TestBug13631() {
5754 const UChar *pats[] = { u"(?<!^)",
5755 u"(?<=^)",
5756 nullptr
5757 };
5758 for (const UChar **pat=pats; *pat; ++pat) {
5759 UErrorCode status = U_ZERO_ERROR;
5760 UnicodeString upat(*pat);
5761 RegexMatcher matcher(upat, 0, status);
5762 const UChar s =u'a';
5763 UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5764 REGEX_CHECK_STATUS;
5765 matcher.reset(ut);
5766 while (matcher.find()) {
5767 }
5768 utext_close(ut);
5769 }
5770 }
5771
5772 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5773 // where a following group specification would be expected.
5774 // Failure shows when running the test under Clang's Address Sanitizer.
5775
TestBug13632()5776 void RegexTest::TestBug13632() {
5777 UErrorCode status = U_ZERO_ERROR;
5778 URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5779 const char16_t *sourceString = u"Hello, world.";
5780 uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5781
5782 const int32_t destCap = 20;
5783 char16_t dest[destCap] = {};
5784 const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
5785 uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5786
5787 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5788 uregex_close(re);
5789 }
5790
TestBug20359()5791 void RegexTest::TestBug20359() {
5792 // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5793 // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5794 // Changed implementation to loop instead of recursing.
5795
5796 UnicodeString pattern;
5797 for (int i=0; i<50000; ++i) {
5798 pattern += u"\\Q\\E";
5799 }
5800 pattern += u"x";
5801
5802 UErrorCode status = U_ZERO_ERROR;
5803 LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5804 0, nullptr, &status));
5805 assertSuccess(WHERE, status);
5806
5807 // We have passed the point where the bug crashed. The following is a small sanity
5808 // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5809
5810 uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5811 assertSuccess(WHERE, status);
5812 assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5813 assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5814 assertSuccess(WHERE, status);
5815 }
5816
5817
TestBug20863()5818 void RegexTest::TestBug20863() {
5819 // Test that patterns with a large number of named capture groups work correctly.
5820 //
5821 // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5822 // construction of the map from capture names to numbers, and decreasing the
5823 // default size of the map.
5824
5825 constexpr int GROUP_COUNT = 2000;
5826 std::vector<UnicodeString> groupNames;
5827 for (int32_t i=0; i<GROUP_COUNT; ++i) {
5828 UnicodeString name;
5829 name.append(u"name");
5830 name.append(Int64ToUnicodeString(i));
5831 groupNames.push_back(name);
5832 }
5833
5834 UnicodeString patternString;
5835 for (UnicodeString name: groupNames) {
5836 patternString.append(u"(?<");
5837 patternString.append(name);
5838 patternString.append(u">.)");
5839 }
5840
5841 UErrorCode status = U_ZERO_ERROR;
5842 UParseError pe;
5843 LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
5844 if (!assertSuccess(WHERE, status)) {
5845 return;
5846 }
5847
5848 for (int32_t i=0; i<GROUP_COUNT; ++i) {
5849 int32_t group = pattern->groupNumberFromName(groupNames[i], status);
5850 if (!assertSuccess(WHERE, status)) {
5851 return;
5852 }
5853 assertEquals(WHERE, i+1, group);
5854 // Note: group 0 is the overall match; group 1 is the first separate capture group.
5855 }
5856
5857 // Verify that assignment of patterns with various combinations of named capture work.
5858 // Lazy creation of the internal named capture map changed the implementation logic here.
5859 {
5860 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5861 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5862 assertSuccess(WHERE, status);
5863 assertFalse(WHERE, *pat1 == *pat2);
5864 *pat1 = *pat2;
5865 assertTrue(WHERE, *pat1 == *pat2);
5866 assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
5867 assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
5868 assertSuccess(WHERE, status);
5869 }
5870
5871 {
5872 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5873 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5874 assertSuccess(WHERE, status);
5875 assertFalse(WHERE, *pat1 == *pat2);
5876 *pat2 = *pat1;
5877 assertTrue(WHERE, *pat1 == *pat2);
5878 assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
5879 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5880 status = U_ZERO_ERROR;
5881 assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
5882 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5883 status = U_ZERO_ERROR;
5884 }
5885
5886 {
5887 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
5888 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
5889 assertSuccess(WHERE, status);
5890 assertFalse(WHERE, *pat1 == *pat2);
5891 *pat2 = *pat1;
5892 assertTrue(WHERE, *pat1 == *pat2);
5893 assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
5894 assertSuccess(WHERE, status);
5895 assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
5896 assertSuccess(WHERE, status);
5897 assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
5898 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5899 status = U_ZERO_ERROR;
5900 assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
5901 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5902 status = U_ZERO_ERROR;
5903 }
5904
5905 }
5906
5907
5908 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5909