1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 *   Copyright (C) 1999-2016 International Business Machines Corporation and
6 *   others. All Rights Reserved.
7 ********************************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 *   03/22/2000  Madhu       Added additional tests
11 ********************************************************************************
12 */
13 
14 #include <stdio.h>
15 
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
29 #include "cmemory.h"
30 #include "hash.h"
31 
// Reports a data error naming the source file, line and ICU error name
// when `status` holds a failure code; does nothing otherwise.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
38 
// Reports a data error naming the source file and line when `expr`
// evaluates to false; does nothing otherwise.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", \
                  __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
44 
/**
 * Concatenates the escaped pattern of a UnicodeSet onto a string, so that
 * sets can be appended directly to log and error messages.
 *
 * @param left text that precedes the set's pattern
 * @param set  set whose toPattern() output is appended
 * @return left followed by the escaped pattern of set
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
50 
UnicodeSetTest()51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53 
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55     if(utf8Cnv==NULL) {
56         UErrorCode errorCode=U_ZERO_ERROR;
57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
58     }
59     return utf8Cnv;
60 }
61 
~UnicodeSetTest()62 UnicodeSetTest::~UnicodeSetTest() {
63     ucnv_close(utf8Cnv);
64 }
65 
66 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68                                const char* &name, char* /*par*/) {
69     if (exec) {
70         logln(u"TestSuite UnicodeSetTest");
71     }
72     TESTCASE_AUTO_BEGIN;
73     TESTCASE_AUTO(TestPatterns);
74     TESTCASE_AUTO(TestAddRemove);
75     TESTCASE_AUTO(TestCategories);
76     TESTCASE_AUTO(TestCloneEqualHash);
77     TESTCASE_AUTO(TestMinimalRep);
78     TESTCASE_AUTO(TestAPI);
79     TESTCASE_AUTO(TestScriptSet);
80     TESTCASE_AUTO(TestPropertySet);
81     TESTCASE_AUTO(TestClone);
82     TESTCASE_AUTO(TestExhaustive);
83     TESTCASE_AUTO(TestToPattern);
84     TESTCASE_AUTO(TestIndexOf);
85     TESTCASE_AUTO(TestStrings);
86     TESTCASE_AUTO(Testj2268);
87     TESTCASE_AUTO(TestCloseOver);
88     TESTCASE_AUTO(TestEscapePattern);
89     TESTCASE_AUTO(TestInvalidCodePoint);
90     TESTCASE_AUTO(TestSymbolTable);
91     TESTCASE_AUTO(TestSurrogate);
92     TESTCASE_AUTO(TestPosixClasses);
93     TESTCASE_AUTO(TestIteration);
94     TESTCASE_AUTO(TestFreezable);
95     TESTCASE_AUTO(TestSpan);
96     TESTCASE_AUTO(TestStringSpan);
97     TESTCASE_AUTO(TestPatternWithSurrogates);
98     TESTCASE_AUTO(TestIntOverflow);
99     TESTCASE_AUTO(TestUnusedCcc);
100     TESTCASE_AUTO(TestDeepPattern);
101     TESTCASE_AUTO(TestEmptyString);
102     TESTCASE_AUTO(TestSkipToStrings);
103     TESTCASE_AUTO(TestPatternCodePointComplement);
104     TESTCASE_AUTO_END;
105 }
106 
// Marker placed inside the expected-string lists handed to expectToPattern().
// NOTE(review): it appears to separate strings that must be in the set from
// strings that must not be -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
108 
109 /**
110  * UVector was improperly copying contents
111  * This code will crash this is still true
112  */
Testj2268()113 void UnicodeSetTest::Testj2268() {
114   UnicodeSet t;
115   t.add(UnicodeString("abc"));
116   UnicodeSet test(t);
117   UnicodeString ustrPat;
118   test.toPattern(ustrPat, TRUE);
119 }
120 
121 /**
122  * Test toPattern().
123  */
TestToPattern()124 void UnicodeSetTest::TestToPattern() {
125     UErrorCode ec = U_ZERO_ERROR;
126 
127     // Test that toPattern() round trips with syntax characters and
128     // whitespace.
129     {
130         static const char* OTHER_TOPATTERN_TESTS[] = {
131             "[[:latin:]&[:greek:]]",
132             "[[:latin:]-[:greek:]]",
133             "[:nonspacing mark:]",
134             NULL
135         };
136 
137         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
138             ec = U_ZERO_ERROR;
139             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
140             if (U_FAILURE(ec)) {
141                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
142                 continue;
143             }
144             checkPat(OTHER_TOPATTERN_TESTS[j], s);
145         }
146 
147         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
148             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
149 
150                 // check various combinations to make sure they all work.
151                 if (i != 0 && !toPatternAux(i, i)){
152                     continue;
153                 }
154                 if (!toPatternAux(0, i)){
155                     continue;
156                 }
157                 if (!toPatternAux(i, 0xFFFF)){
158                     continue;
159                 }
160             }
161         }
162     }
163 
164     // Test pattern behavior of multicharacter strings.
165     {
166         ec = U_ZERO_ERROR;
167         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
168 
169         // This loop isn't a loop.  It's here to make the compiler happy.
170         // If you're curious, try removing it and changing the 'break'
171         // statements (except for the last) to goto's.
172         for (;;) {
173             if (U_FAILURE(ec)) break;
174             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
175             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
176 
177             s->add("ac");
178             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
179             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
180 
181             s->applyPattern(u"[a-z {\\{l} {r\\}}]", ec);
182             if (U_FAILURE(ec)) break;
183             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
184             expectToPattern(*s, u"[a-z{r\\}}{\\{l}]", exp3);
185 
186             s->add("[]");
187             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
188             expectToPattern(*s, u"[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
189 
190             s->applyPattern(u"[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
191             if (U_FAILURE(ec)) break;
192             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
193             expectToPattern(*s, u"[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
194 
195             // j2189
196             s->clear();
197             s->add(UnicodeString("abc", ""));
198             s->add(UnicodeString("abc", ""));
199             const char* exp6[] = {"abc", NOT, "ab", NULL};
200             expectToPattern(*s, "[{abc}]", exp6);
201 
202             break;
203         }
204 
205         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
206         delete s;
207     }
208 
209     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
210     UnicodeSet s;
211     s.add(u'a', u'b');
212     expectToPattern(s, "[ab]", NULL);
213 }
214 
toPatternAux(UChar32 start,UChar32 end)215 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
216 
217     // use Integer.toString because Utility.hex doesn't handle ints
218     UnicodeString pat = "";
219     // TODO do these in hex
220     //String source = "0x" + Integer.toString(start,16).toUpperCase();
221     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
222     UnicodeString source;
223     source = source + (uint32_t)start;
224     if (start != end)
225         source = source + ".." + (uint32_t)end;
226     UnicodeSet testSet;
227     testSet.add(start, end);
228     return checkPat(source, testSet);
229 }
230 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)231 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
232                                const UnicodeSet& testSet) {
233     // What we want to make sure of is that a pattern generated
234     // by toPattern(), with or without escaped unprintables, can
235     // be passed back into the UnicodeSet constructor.
236     UnicodeString pat0;
237 
238     testSet.toPattern(pat0, TRUE);
239 
240     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
241 
242     //String pat1 = unescapeLeniently(pat0);
243     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
244 
245     UnicodeString pat2;
246     testSet.toPattern(pat2, FALSE);
247     if (!checkPat(source, testSet, pat2)) return FALSE;
248 
249     //String pat3 = unescapeLeniently(pat2);
250     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
251 
252     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
253     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
254     return TRUE;
255 }
256 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)257 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
258                                const UnicodeSet& testSet,
259                                const UnicodeString& pat) {
260     UErrorCode ec = U_ZERO_ERROR;
261     UnicodeSet testSet2(pat, ec);
262     if (testSet2 != testSet) {
263         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
264         return FALSE;
265     }
266     return TRUE;
267 }
268 
269 void
TestPatterns(void)270 UnicodeSetTest::TestPatterns(void) {
271     UnicodeSet set;
272     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
273     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
274     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
275     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
276     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
277     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
278 
279     // Throw in a test of complement
280     set.complement();
281     UnicodeString exp;
282     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(u'z'+1)).append(u'\uFFFF');
283     expectPairs(set, exp);
284 }
285 
286 void
TestCategories(void)287 UnicodeSetTest::TestCategories(void) {
288     UErrorCode status = U_ZERO_ERROR;
289     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
290     UnicodeSet set(pat, status);
291     if (U_FAILURE(status)) {
292         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
293         return;
294     } else {
295         expectContainment(set, pat, "ABC", "abc");
296     }
297 
298     UChar32 i;
299     int32_t failures = 0;
300     // Make sure generation of L doesn't pollute cached Lu set
301     // First generate L, then Lu
302     set.applyPattern("[:L:]", status);
303     if (U_FAILURE(status)) { errln("FAIL"); return; }
304     for (i=0; i<0x200; ++i) {
305         UBool l = u_isalpha((UChar)i);
306         if (l != set.contains(i)) {
307             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
308                   set.contains(i));
309             if (++failures == 10) break;
310         }
311     }
312 
313     set.applyPattern("[:Lu:]", status);
314     if (U_FAILURE(status)) { errln("FAIL"); return; }
315     for (i=0; i<0x200; ++i) {
316         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
317         if (lu != set.contains(i)) {
318             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
319                   set.contains(i));
320             if (++failures == 20) break;
321         }
322     }
323 }
324 void
TestCloneEqualHash(void)325 UnicodeSetTest::TestCloneEqualHash(void) {
326     UErrorCode status = U_ZERO_ERROR;
327     // set1 and set2 used to be built with the obsolete constructor taking
328     // UCharCategory values; replaced with pattern constructors
329     // markus 20030502
330     UnicodeSet *set1=new UnicodeSet(u"\\p{Lowercase Letter}", status); //  :Ll: Letter, lowercase
331     UnicodeSet *set1a=new UnicodeSet(u"[:Ll:]", status); //  Letter, lowercase
332     if (U_FAILURE(status)){
333         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
334         return;
335     }
336     UnicodeSet *set2=new UnicodeSet(u"\\p{Decimal Number}", status);   //Number, Decimal digit
337     UnicodeSet *set2a=new UnicodeSet(u"[:Nd:]", status);   //Number, Decimal digit
338     if (U_FAILURE(status)){
339         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
340         return;
341     }
342 
343     if (*set1 != *set1a) {
344         errln("FAIL: category constructor for Ll broken");
345     }
346     if (*set2 != *set2a) {
347         errln("FAIL: category constructor for Nd broken");
348     }
349     delete set1a;
350     delete set2a;
351 
352     logln("Testing copy construction");
353     UnicodeSet *set1copy=new UnicodeSet(*set1);
354     if(*set1 != *set1copy || *set1 == *set2 ||
355         getPairs(*set1) != getPairs(*set1copy) ||
356         set1->hashCode() != set1copy->hashCode()){
357         errln("FAIL : Error in copy construction");
358         return;
359     }
360 
361     logln("Testing =operator");
362     UnicodeSet set1equal=*set1;
363     UnicodeSet set2equal=*set2;
364     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
365         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
366         errln("FAIL: Error in =operator");
367     }
368 
369     logln("Testing clone()");
370     UnicodeSet *set1clone=set1->clone();
371     UnicodeSet *set2clone=set2->clone();
372     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
373         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
374         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
375         errln("FAIL: Error in clone");
376     }
377 
378     logln("Testing hashcode");
379     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
380         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
381         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
382         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
383         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
384         errln("FAIL: Error in hashCode()");
385     }
386 
387     delete set1;
388     delete set1copy;
389     delete set2;
390     delete set1clone;
391     delete set2clone;
392 
393 
394 }
395 void
TestAddRemove(void)396 UnicodeSetTest::TestAddRemove(void) {
397     UnicodeSet set; // Construct empty set
398     doAssert(set.isEmpty() == TRUE, "set should be empty");
399     doAssert(set.size() == 0, "size should be 0");
400     set.complement();
401     doAssert(set.size() == 0x110000, "size should be 0x110000");
402     set.clear();
403     set.add(0x0061, 0x007a);
404     expectPairs(set, "az");
405     doAssert(set.isEmpty() == FALSE, "set should not be empty");
406     doAssert(set.size() != 0, "size should not be equal to 0");
407     doAssert(set.size() == 26, "size should be equal to 26");
408     set.remove(0x006d, 0x0070);
409     expectPairs(set, "alqz");
410     doAssert(set.size() == 22, "size should be equal to 22");
411     set.remove(0x0065, 0x0067);
412     expectPairs(set, "adhlqz");
413     doAssert(set.size() == 19, "size should be equal to 19");
414     set.remove(0x0064, 0x0069);
415     expectPairs(set, "acjlqz");
416     doAssert(set.size() == 16, "size should be equal to 16");
417     set.remove(0x0063, 0x0072);
418     expectPairs(set, "absz");
419     doAssert(set.size() == 10, "size should be equal to 10");
420     set.add(0x0066, 0x0071);
421     expectPairs(set, "abfqsz");
422     doAssert(set.size() == 22, "size should be equal to 22");
423     set.remove(0x0061, 0x0067);
424     expectPairs(set, "hqsz");
425     set.remove(0x0061, 0x007a);
426     expectPairs(set, "");
427     doAssert(set.isEmpty() == TRUE, "set should be empty");
428     doAssert(set.size() == 0, "size should be 0");
429     set.add(0x0061);
430     doAssert(set.isEmpty() == FALSE, "set should not be empty");
431     doAssert(set.size() == 1, "size should not be equal to 1");
432     set.add(0x0062);
433     set.add(0x0063);
434     expectPairs(set, "ac");
435     doAssert(set.size() == 3, "size should not be equal to 3");
436     set.add(0x0070);
437     set.add(0x0071);
438     expectPairs(set, "acpq");
439     doAssert(set.size() == 5, "size should not be equal to 5");
440     set.clear();
441     expectPairs(set, "");
442     doAssert(set.isEmpty() == TRUE, "set should be empty");
443     doAssert(set.size() == 0, "size should be 0");
444 
445     // Try removing an entire set from another set
446     expectPattern(set, "[c-x]", "cx");
447     UnicodeSet set2;
448     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
449     set.removeAll(set2);
450     expectPairs(set, "deluxx");
451 
452     // Try adding an entire set to another set
453     expectPattern(set, "[jackiemclean]", "aacceein");
454     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
455     set.addAll(set2);
456     expectPairs(set, "aacehort");
457     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
458 
459     // Try retaining an set of elements contained in another set (intersection)
460     UnicodeSet set3;
461     expectPattern(set3, "[a-c]", "ac");
462     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
463     set3.remove(0x0062);
464     expectPairs(set3, "aacc");
465     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
466     set.retainAll(set3);
467     expectPairs(set, "aacc");
468     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
469     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
470     set.clear();
471     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
472 
473     // Test commutativity
474     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
475     expectPattern(set2, "[jackiemclean]", "aacceein");
476     set.addAll(set2);
477     expectPairs(set, "aacehort");
478     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
479 
480 
481 
482 
483 }
484 
485 /**
486  * Make sure minimal representation is maintained.
487  */
TestMinimalRep()488 void UnicodeSetTest::TestMinimalRep() {
489     UErrorCode status = U_ZERO_ERROR;
490     // This is pretty thoroughly tested by checkCanonicalRep()
491     // run against the exhaustive operation results.  Use the code
492     // here for debugging specific spot problems.
493 
494     // 1 overlap against 2
495     UnicodeSet set("[h-km-q]", status);
496     if (U_FAILURE(status)) { errln("FAIL"); return; }
497     UnicodeSet set2("[i-o]", status);
498     if (U_FAILURE(status)) { errln("FAIL"); return; }
499     set.addAll(set2);
500     expectPairs(set, "hq");
501     // right
502     set.applyPattern("[a-m]", status);
503     if (U_FAILURE(status)) { errln("FAIL"); return; }
504     set2.applyPattern("[e-o]", status);
505     if (U_FAILURE(status)) { errln("FAIL"); return; }
506     set.addAll(set2);
507     expectPairs(set, "ao");
508     // left
509     set.applyPattern("[e-o]", status);
510     if (U_FAILURE(status)) { errln("FAIL"); return; }
511     set2.applyPattern("[a-m]", status);
512     if (U_FAILURE(status)) { errln("FAIL"); return; }
513     set.addAll(set2);
514     expectPairs(set, "ao");
515     // 1 overlap against 3
516     set.applyPattern("[a-eg-mo-w]", status);
517     if (U_FAILURE(status)) { errln("FAIL"); return; }
518     set2.applyPattern("[d-q]", status);
519     if (U_FAILURE(status)) { errln("FAIL"); return; }
520     set.addAll(set2);
521     expectPairs(set, "aw");
522 }
523 
TestAPI()524 void UnicodeSetTest::TestAPI() {
525     UErrorCode status = U_ZERO_ERROR;
526     // default ct
527     UnicodeSet set;
528     if (!set.isEmpty() || set.getRangeCount() != 0) {
529         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
530               set);
531     }
532 
533     // clear(), isEmpty()
534     set.add(0x0061);
535     if (set.isEmpty()) {
536         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
537               set);
538     }
539     set.clear();
540     if (!set.isEmpty()) {
541         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
542               set);
543     }
544 
545     // size()
546     set.clear();
547     if (set.size() != 0) {
548         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
549               ": " + set);
550     }
551     set.add(0x0061);
552     if (set.size() != 1) {
553         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
554               ": " + set);
555     }
556     set.add(0x0031, 0x0039);
557     if (set.size() != 10) {
558         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
559               ": " + set);
560     }
561 
562     // contains(first, last)
563     set.clear();
564     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
565     if (U_FAILURE(status)) { errln("FAIL"); return; }
566     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
567         UChar32 a = set.getRangeStart(i);
568         UChar32 b = set.getRangeEnd(i);
569         if (!set.contains(a, b)) {
570             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
571                   " but doesn't: " + set);
572         }
573         if (set.contains((UChar32)(a-1), b)) {
574             errln((UnicodeString)"FAIL, shouldn't contain " +
575                   (unsigned short)(a-1) + '-' + (unsigned short)b +
576                   " but does: " + set);
577         }
578         if (set.contains(a, (UChar32)(b+1))) {
579             errln((UnicodeString)"FAIL, shouldn't contain " +
580                   (unsigned short)a + '-' + (unsigned short)(b+1) +
581                   " but does: " + set);
582         }
583     }
584 
585     // Ported InversionList test.
586     UnicodeSet a((UChar32)3,(UChar32)10);
587     UnicodeSet b((UChar32)7,(UChar32)15);
588     UnicodeSet c;
589 
590     logln((UnicodeString)"a [3-10]: " + a);
591     logln((UnicodeString)"b [7-15]: " + b);
592     c = a;
593     c.addAll(b);
594     UnicodeSet exp((UChar32)3,(UChar32)15);
595     if (c == exp) {
596         logln((UnicodeString)"c.set(a).add(b): " + c);
597     } else {
598         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
599     }
600     c.complement();
601     exp.set((UChar32)0, (UChar32)2);
602     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
603     if (c == exp) {
604         logln((UnicodeString)"c.complement(): " + c);
605     } else {
606         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
607     }
608     c.complement();
609     exp.set((UChar32)3, (UChar32)15);
610     if (c == exp) {
611         logln((UnicodeString)"c.complement(): " + c);
612     } else {
613         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
614     }
615     c = a;
616     c.complementAll(b);
617     exp.set((UChar32)3,(UChar32)6);
618     exp.add((UChar32)11,(UChar32) 15);
619     if (c == exp) {
620         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
621     } else {
622         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
623     }
624 
625     exp = c;
626     bitsToSet(setToBits(c), c);
627     if (c == exp) {
628         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
629     } else {
630         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
631     }
632 
633     // Additional tests for coverage JB#2118
634     //UnicodeSet::complement(class UnicodeString const &)
635     //UnicodeSet::complementAll(class UnicodeString const &)
636     //UnicodeSet::containsNone(class UnicodeSet const &)
637     //UnicodeSet::containsNone(long,long)
638     //UnicodeSet::containsSome(class UnicodeSet const &)
639     //UnicodeSet::containsSome(long,long)
640     //UnicodeSet::removeAll(class UnicodeString const &)
641     //UnicodeSet::retain(long)
642     //UnicodeSet::retainAll(class UnicodeString const &)
643     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
644     //UnicodeSetIterator::getString(void)
645     set.clear();
646     set.complement("ab");
647     exp.applyPattern("[{ab}]", status);
648     if (U_FAILURE(status)) { errln("FAIL"); return; }
649     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
650 
651     UnicodeSetIterator iset(set);
652     if (!iset.next() || !iset.isString()) {
653         errln("FAIL: UnicodeSetIterator::next/isString");
654     } else if (iset.getString() != "ab") {
655         errln("FAIL: UnicodeSetIterator::getString");
656     }
657 
658     set.add(u'a', u'z');
659     set.complementAll("alan");
660     exp.applyPattern("[{ab}b-kmo-z]", status);
661     if (U_FAILURE(status)) { errln("FAIL"); return; }
662     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
663 
664     exp.applyPattern("[a-z]", status);
665     if (U_FAILURE(status)) { errln("FAIL"); return; }
666     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
667     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
668     exp.applyPattern("[aln]", status);
669     if (U_FAILURE(status)) { errln("FAIL"); return; }
670     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
671     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
672 
673     if (set.containsNone(u'a', u'z')) {
674         errln("FAIL: containsNone(UChar32, UChar32)");
675     }
676     if (!set.containsSome(u'a', u'z')) {
677         errln("FAIL: containsSome(UChar32, UChar32)");
678     }
679     if (!set.containsNone(u'A', u'Z')) {
680         errln("FAIL: containsNone(UChar32, UChar32)");
681     }
682     if (set.containsSome(u'A', u'Z')) {
683         errln("FAIL: containsSome(UChar32, UChar32)");
684     }
685 
686     set.removeAll("liu");
687     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
688     if (U_FAILURE(status)) { errln("FAIL"); return; }
689     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
690 
691     set.retainAll("star");
692     exp.applyPattern("[rst]", status);
693     if (U_FAILURE(status)) { errln("FAIL"); return; }
694     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
695 
696     set.retain(u's');
697     exp.applyPattern("[s]", status);
698     if (U_FAILURE(status)) { errln("FAIL"); return; }
699     if (set != exp) { errln("FAIL: retain('s')"); return; }
700 
701     // ICU 2.6 coverage tests
702     // public final UnicodeSet retain(String s);
703     // public final UnicodeSet remove(int c);
704     // public final UnicodeSet remove(String s);
705     // public int hashCode();
706     set.applyPattern(u"[a-z{ab}{cd}]", status);
707     if (U_FAILURE(status)) { errln("FAIL"); return; }
708     set.retain(u"cd");
709     exp.applyPattern(u"[{cd}]", status);
710     if (U_FAILURE(status)) { errln("FAIL"); return; }
711     if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
712 
713     set.applyPattern(u"[a-z{ab}{yz}]", status);
714     if (U_FAILURE(status)) { errln("FAIL"); return; }
715     set.retain(u"cd");
716     exp.clear();
717     if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
718 
719     set.applyPattern(u"[a-z{ab}{cd}]", status);
720     if (U_FAILURE(status)) { errln("FAIL"); return; }
721     set.remove(u'c');
722     exp.applyPattern(u"[abd-z{ab}{cd}]", status);
723     if (set != exp) { errln("FAIL: remove('c')"); return; }
724 
725     set.remove(u"cd");
726     exp.applyPattern(u"[abd-z{ab}]", status);
727     if (U_FAILURE(status)) { errln("FAIL"); return; }
728     if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
729 
730     set.applyPattern("[s]", status);
731     if (U_FAILURE(status)) { errln("FAIL"); return; }
732     uint16_t buf[32];
733     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
734     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
735     if (slen != 3 || buf[0] != 2 || buf[1] != u's' || buf[2] != u't') {
736         errln("FAIL: serialize");
737         return;
738     }
739 
740     // Conversions to and from USet
741     UnicodeSet *uniset = &set;
742     USet *uset = uniset->toUSet();
743     TEST_ASSERT((void *)uset == (void *)uniset);
744     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
745     TEST_ASSERT((void *)setx == (void *)uset);
746     const UnicodeSet *constSet = uniset;
747     const USet *constUSet = constSet->toUSet();
748     TEST_ASSERT((void *)constUSet == (void *)constSet);
749     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
750     TEST_ASSERT((void *)constSetx == (void *)constUSet);
751 
752     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
753     UnicodeString longString=u"aaaaaaaaaabbbbbbbbbbcccccccccc";
754     UnicodeSet ac(0x61, 0x63);
755     ac.remove(0x62).freeze();
756     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
757         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
758         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
759         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
760         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
761         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
762         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
763         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
764         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
765         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
766     ) {
767         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
768     }
769     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
770         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
771         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
772         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
773         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
774         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
775         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
776         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
777         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
778         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
779     ) {
780         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
781     }
782 }
783 
TestIteration()784 void UnicodeSetTest::TestIteration() {
785     UErrorCode ec = U_ZERO_ERROR;
786     int i = 0;
787     int outerLoop;
788 
789     // 6 code points, 3 ranges, 2 strings, 8 total elements
790     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
791     UnicodeSet set(u"[zabyc\\U0001abcd{str1}{str2}]", ec);
792     TEST_ASSERT_SUCCESS(ec);
793     UnicodeSetIterator it(set);
794 
795     for (outerLoop=0; outerLoop<3; outerLoop++) {
796         // Run the test multiple times, to check that iterator.reset() is working.
797         for (i=0; i<10; i++) {
798             UBool         nextv        = it.next();
799             UBool         isString     = it.isString();
800             int32_t       codePoint    = it.getCodepoint();
801             //int32_t       codePointEnd = it.getCodepointEnd();
802             UnicodeString s   = it.getString();
803             switch (i) {
804             case 0:
805                 TEST_ASSERT(nextv == TRUE);
806                 TEST_ASSERT(isString == FALSE);
807                 TEST_ASSERT(codePoint==0x61);
808                 TEST_ASSERT(s == "a");
809                 break;
810             case 1:
811                 TEST_ASSERT(nextv == TRUE);
812                 TEST_ASSERT(isString == FALSE);
813                 TEST_ASSERT(codePoint==0x62);
814                 TEST_ASSERT(s == "b");
815                 break;
816             case 2:
817                 TEST_ASSERT(nextv == TRUE);
818                 TEST_ASSERT(isString == FALSE);
819                 TEST_ASSERT(codePoint==0x63);
820                 TEST_ASSERT(s == "c");
821                 break;
822             case 3:
823                 TEST_ASSERT(nextv == TRUE);
824                 TEST_ASSERT(isString == FALSE);
825                 TEST_ASSERT(codePoint==0x79);
826                 TEST_ASSERT(s == "y");
827                 break;
828             case 4:
829                 TEST_ASSERT(nextv == TRUE);
830                 TEST_ASSERT(isString == FALSE);
831                 TEST_ASSERT(codePoint==0x7a);
832                 TEST_ASSERT(s == "z");
833                 break;
834             case 5:
835                 TEST_ASSERT(nextv == TRUE);
836                 TEST_ASSERT(isString == FALSE);
837                 TEST_ASSERT(codePoint==0x1abcd);
838                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
839                 break;
840             case 6:
841                 TEST_ASSERT(nextv == TRUE);
842                 TEST_ASSERT(isString == TRUE);
843                 TEST_ASSERT(s == "str1");
844                 break;
845             case 7:
846                 TEST_ASSERT(nextv == TRUE);
847                 TEST_ASSERT(isString == TRUE);
848                 TEST_ASSERT(s == "str2");
849                 break;
850             case 8:
851                 TEST_ASSERT(nextv == FALSE);
852                 break;
853             case 9:
854                 TEST_ASSERT(nextv == FALSE);
855                 break;
856             }
857         }
858         it.reset();  // prepare to run the iteration again.
859     }
860 }
861 
862 
863 
864 
TestStrings()865 void UnicodeSetTest::TestStrings() {
866     UErrorCode ec = U_ZERO_ERROR;
867 
868     UnicodeSet* testList[] = {
869         UnicodeSet::createFromAll("abc"),
870         new UnicodeSet("[a-c]", ec),
871 
872         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
873         new UnicodeSet("[{ll}{ch}a-z]", ec),
874 
875         UnicodeSet::createFrom("ab}c"),
876         new UnicodeSet("[{ab\\}c}]", ec),
877 
878         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
879         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
880 
881         NULL
882     };
883 
884     if (U_FAILURE(ec)) {
885         errln("FAIL: couldn't construct test sets");
886     }
887     assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
888     assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
889 
890     for (int32_t i = 0; testList[i] != NULL; i+=2) {
891         if (U_SUCCESS(ec)) {
892             UnicodeString pat0, pat1;
893             testList[i]->toPattern(pat0, TRUE);
894             testList[i+1]->toPattern(pat1, TRUE);
895             if (*testList[i] == *testList[i+1]) {
896                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
897             } else {
898                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
899             }
900         }
901         delete testList[i];
902         delete testList[i+1];
903     }
904 }
905 
906 /**
907  * Test the [:Latin:] syntax.
908  */
TestScriptSet()909 void UnicodeSetTest::TestScriptSet() {
910     expectContainment(u"[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
911 
912     expectContainment(u"[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
913 
914     /* Jitterbug 1423 */
915     expectContainment(u"[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
916 
917 }
918 
919 /**
920  * Test the [:Latin:] syntax.
921  */
TestPropertySet()922 void UnicodeSetTest::TestPropertySet() {
923     static const char* const DATA[] = {
924         // Pattern, Chars IN, Chars NOT in
925 
926         "[:Latin:]",
927         "aA",
928         "\\u0391\\u03B1",
929 
930         "[\\p{Greek}]",
931         "\\u0391\\u03B1",
932         "aA",
933 
934         "\\P{ GENERAL Category = upper case letter }",
935         "abc",
936         "ABC",
937 
938 #if !UCONFIG_NO_NORMALIZATION
939         // Combining class: @since ICU 2.2
940         // Check both symbolic and numeric
941         "\\p{ccc=Nukta}",
942         "\\u0ABC",
943         "abc",
944 
945         "\\p{Canonical Combining Class = 11}",
946         "\\u05B1",
947         "\\u05B2",
948 
949         "[:c c c = iota subscript :]",
950         "\\u0345",
951         "xyz",
952 #endif
953 
954         // Bidi class: @since ICU 2.2
955         "\\p{bidiclass=lefttoright}",
956         "abc",
957         "\\u0671\\u0672",
958 
959         // Binary properties: @since ICU 2.2
960         "\\p{ideographic}",
961         "\\u4E0A",
962         "x",
963 
964         "[:math=false:]",
965         "q)*(",
966         // weiv: )(and * were removed from math in Unicode 4.0.1
967         //"(*+)",
968         "+<>^",
969 
970         // JB#1767 \N{}, \p{ASCII}
971         "[:Ascii:]",
972         "abc\\u0000\\u007F",
973         "\\u0080\\u4E00",
974 
975         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
976         "az",
977         "qrs",
978 
979         // JB#2015
980         "[:any:]",
981         "a\\U0010FFFF",
982         "",
983 
984         "[:nv=0.5:]",
985         "\\u00BD\\u0F2A",
986         "\\u00BC",
987 
988         // JB#2653: Age
989         "[:Age=1.1:]",
990         "\\u03D6", // 1.1
991         "\\u03D8\\u03D9", // 3.2
992 
993         "[:Age=3.1:]",
994         "\\u1800\\u3400\\U0002f800",
995         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
996 
997         // JB#2350: Case_Sensitive
998         "[:Case Sensitive:]",
999         "A\\u1FFC\\U00010410",
1000         ";\\u00B4\\U00010500",
1001 
1002         // JB#2832: C99-compatibility props
1003         "[:blank:]",
1004         " \\u0009",
1005         "1-9A-Z",
1006 
1007         "[:graph:]",
1008         "19AZ",
1009         " \\u0003\\u0007\\u0009\\u000A\\u000D",
1010 
1011         "[:punct:]",
1012         "!@#%&*()[]{}-_\\/;:,.?'\"",
1013         "09azAZ",
1014 
1015         "[:xdigit:]",
1016         "09afAF",
1017         "gG!",
1018 
1019         // Regex compatibility test
1020         "[-b]", // leading '-' is literal
1021         "-b",
1022         "ac",
1023 
1024         "[^-b]", // leading '-' is literal
1025         "ac",
1026         "-b",
1027 
1028         "[b-]", // trailing '-' is literal
1029         "-b",
1030         "ac",
1031 
1032         "[^b-]", // trailing '-' is literal
1033         "ac",
1034         "-b",
1035 
1036         "[a-b-]", // trailing '-' is literal
1037         "ab-",
1038         "c=",
1039 
1040         "[[a-q]&[p-z]-]", // trailing '-' is literal
1041         "pq-",
1042         "or=",
1043 
1044         "[\\s|\\)|:|$|\\>]", // from regex tests
1045         "s|):$>",
1046         "abc",
1047 
1048         "[\\uDC00cd]", // JB#2906: isolated trail at start
1049         "cd\\uDC00",
1050         "ab\\uD800\\U00010000",
1051 
1052         "[ab\\uD800]", // JB#2906: isolated trail at start
1053         "ab\\uD800",
1054         "cd\\uDC00\\U00010000",
1055 
1056         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1057         "abcd\\uD800",
1058         "ef\\uDC00\\U00010000",
1059 
1060         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1061         "abcd\\uDC00",
1062         "ef\\uD800\\U00010000",
1063 
1064 #if !UCONFIG_NO_NORMALIZATION
1065         "[:^lccc=0:]", // Lead canonical class
1066         "\\u0300\\u0301",
1067         "abcd\\u00c0\\u00c5",
1068 
1069         "[:^tccc=0:]", // Trail canonical class
1070         "\\u0300\\u0301\\u00c0\\u00c5",
1071         "abcd",
1072 
1073         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1074         "\\u0300\\u0301\\u00c0\\u00c5",
1075         "abcd",
1076 
1077         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1078         "",
1079         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1080 
1081         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1082         "\\u0F73\\u0F75\\u0F81",
1083         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1084 #endif /* !UCONFIG_NO_NORMALIZATION */
1085 
1086         "[:Assigned:]",
1087         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1088         "\\u0558\\uFDD3\\uFFFE\\U00050005",
1089 
1090         // Script_Extensions, new in Unicode 6.0
1091         "[:scx=Arab:]",
1092         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1093         "\\u088F\\uFDEF\\uFEFE",
1094 
1095         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1096         // so scx-sc is missing U+FDF2.
1097         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1098         "\\u0640\\u064B\\u0650\\u0655",
1099         "\\uFDF2"
1100     };
1101 
1102     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1103 
1104     for (int32_t i=0; i<DATA_LEN; i+=3) {
1105         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1106                           CharsToUnicodeString(DATA[i+2]));
1107     }
1108 }
1109 
1110 /**
1111   * Test that Posix style character classes [:digit:], etc.
1112   *   have the Unicode definitions from TR 18.
1113   */
TestPosixClasses()1114 void UnicodeSetTest::TestPosixClasses() {
1115     {
1116         UErrorCode status = U_ZERO_ERROR;
1117         UnicodeSet s1("[:alpha:]", status);
1118         UnicodeSet s2(u"\\p{Alphabetic}", status);
1119         TEST_ASSERT_SUCCESS(status);
1120         TEST_ASSERT(s1==s2);
1121     }
1122     {
1123         UErrorCode status = U_ZERO_ERROR;
1124         UnicodeSet s1("[:lower:]", status);
1125         UnicodeSet s2(u"\\p{lowercase}", status);
1126         TEST_ASSERT_SUCCESS(status);
1127         TEST_ASSERT(s1==s2);
1128     }
1129     {
1130         UErrorCode status = U_ZERO_ERROR;
1131         UnicodeSet s1("[:upper:]", status);
1132         UnicodeSet s2(u"\\p{Uppercase}", status);
1133         TEST_ASSERT_SUCCESS(status);
1134         TEST_ASSERT(s1==s2);
1135     }
1136     {
1137         UErrorCode status = U_ZERO_ERROR;
1138         UnicodeSet s1("[:punct:]", status);
1139         UnicodeSet s2(u"\\p{gc=Punctuation}", status);
1140         TEST_ASSERT_SUCCESS(status);
1141         TEST_ASSERT(s1==s2);
1142     }
1143     {
1144         UErrorCode status = U_ZERO_ERROR;
1145         UnicodeSet s1("[:digit:]", status);
1146         UnicodeSet s2(u"\\p{gc=DecimalNumber}", status);
1147         TEST_ASSERT_SUCCESS(status);
1148         TEST_ASSERT(s1==s2);
1149     }
1150     {
1151         UErrorCode status = U_ZERO_ERROR;
1152         UnicodeSet s1("[:xdigit:]", status);
1153         UnicodeSet s2(u"[\\p{DecimalNumber}\\p{HexDigit}]", status);
1154         TEST_ASSERT_SUCCESS(status);
1155         TEST_ASSERT(s1==s2);
1156     }
1157     {
1158         UErrorCode status = U_ZERO_ERROR;
1159         UnicodeSet s1("[:alnum:]", status);
1160         UnicodeSet s2(u"[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1161         TEST_ASSERT_SUCCESS(status);
1162         TEST_ASSERT(s1==s2);
1163     }
1164     {
1165         UErrorCode status = U_ZERO_ERROR;
1166         UnicodeSet s1("[:space:]", status);
1167         UnicodeSet s2(u"\\p{Whitespace}", status);
1168         TEST_ASSERT_SUCCESS(status);
1169         TEST_ASSERT(s1==s2);
1170     }
1171     {
1172         UErrorCode status = U_ZERO_ERROR;
1173         UnicodeSet s1("[:blank:]", status);
1174         TEST_ASSERT_SUCCESS(status);
1175         UnicodeSet s2(u"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1176             status);
1177         TEST_ASSERT_SUCCESS(status);
1178         TEST_ASSERT(s1==s2);
1179     }
1180     {
1181         UErrorCode status = U_ZERO_ERROR;
1182         UnicodeSet s1("[:cntrl:]", status);
1183         TEST_ASSERT_SUCCESS(status);
1184         UnicodeSet s2(u"\\p{Control}", status);
1185         TEST_ASSERT_SUCCESS(status);
1186         TEST_ASSERT(s1==s2);
1187     }
1188     {
1189         UErrorCode status = U_ZERO_ERROR;
1190         UnicodeSet s1("[:graph:]", status);
1191         TEST_ASSERT_SUCCESS(status);
1192         UnicodeSet s2(u"[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1193         TEST_ASSERT_SUCCESS(status);
1194         TEST_ASSERT(s1==s2);
1195     }
1196     {
1197         UErrorCode status = U_ZERO_ERROR;
1198         UnicodeSet s1("[:print:]", status);
1199         TEST_ASSERT_SUCCESS(status);
1200         UnicodeSet s2(u"[[:graph:][:blank:]-[\\p{Control}]]", status);
1201         TEST_ASSERT_SUCCESS(status);
1202         TEST_ASSERT(s1==s2);
1203     }
1204 }
1205 /**
1206  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1207  */
TestClone()1208 void UnicodeSetTest::TestClone() {
1209     UErrorCode ec = U_ZERO_ERROR;
1210     UnicodeSet s("[abcxyz]", ec);
1211     UnicodeSet t(s);
1212     expectContainment(t, "abc", "def");
1213 }
1214 
1215 /**
1216  * Test the indexOf() and charAt() methods.
1217  */
TestIndexOf()1218 void UnicodeSetTest::TestIndexOf() {
1219     UErrorCode ec = U_ZERO_ERROR;
1220     UnicodeSet set("[a-cx-y3578]", ec);
1221     if (U_FAILURE(ec)) {
1222         errln("FAIL: UnicodeSet constructor");
1223         return;
1224     }
1225     for (int32_t i=0; i<set.size(); ++i) {
1226         UChar32 c = set.charAt(i);
1227         if (set.indexOf(c) != i) {
1228             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1229                 i, c, set.indexOf(c));
1230         }
1231     }
1232     UChar32 c = set.charAt(set.size());
1233     if (c != -1) {
1234         errln("FAIL: charAt(<out of range>) = %X", c);
1235     }
1236     int32_t j = set.indexOf(u'q');
1237     if (j != -1) {
1238         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1239     }
1240 }
1241 
1242 /**
1243  * Test closure API.
1244  */
TestCloseOver()1245 void UnicodeSetTest::TestCloseOver() {
1246     UErrorCode ec = U_ZERO_ERROR;
1247 
1248     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1249     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1250     const char* DATA[] = {
1251         // selector, input, output
1252         CASE,
1253         "[aq\\u00DF{Bc}{bC}{Fi}]",
1254         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1255 
1256         CASE,
1257         "[\\u01F1]", // 'DZ'
1258         "[\\u01F1\\u01F2\\u01F3]",
1259 
1260         CASE,
1261         "[\\u1FB4]",
1262         "[\\u1FB4{\\u03AC\\u03B9}]",
1263 
1264         CASE,
1265         "[{F\\uFB01}]",
1266         "[\\uFB03{ffi}]",
1267 
1268         CASE, // make sure binary search finds limits
1269         "[a\\uFF3A]",
1270         "[aA\\uFF3A\\uFF5A]",
1271 
1272         CASE,
1273         "[a-z]","[A-Za-z\\u017F\\u212A]",
1274         CASE,
1275         "[abc]","[A-Ca-c]",
1276         CASE,
1277         "[ABC]","[A-Ca-c]",
1278 
1279         CASE, "[i]", "[iI]",
1280 
1281         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1282         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1283 
1284         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1285 
1286         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1287 
1288         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1289 
1290         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1291 
1292         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1293 
1294         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1295 
1296         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1297         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1298 
1299         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1300 
1301         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1302 
1303         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1304 
1305 #if !UCONFIG_NO_FILE_IO
1306         CASE_MAPPINGS,
1307         "[aq\\u00DF{Bc}{bC}{Fi}]",
1308         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1309 #endif
1310 
1311         CASE_MAPPINGS,
1312         "[\\u01F1]", // 'DZ'
1313         "[\\u01F1\\u01F2\\u01F3]",
1314 
1315         CASE_MAPPINGS,
1316         "[a-z]",
1317         "[A-Za-z]",
1318 
1319         NULL
1320     };
1321 
1322     UnicodeSet s;
1323     UnicodeSet t;
1324     UnicodeString buf;
1325     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1326         int32_t selector = DATA[i][0];
1327         UnicodeString pat(DATA[i+1], -1, US_INV);
1328         UnicodeString exp(DATA[i+2], -1, US_INV);
1329         s.applyPattern(pat, ec);
1330         s.closeOver(selector);
1331         t.applyPattern(exp, ec);
1332         if (U_FAILURE(ec)) {
1333             errln("FAIL: applyPattern failed");
1334             continue;
1335         }
1336         if (s == t) {
1337             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1338         } else {
1339             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1340                   s.toPattern(buf, TRUE) + ", expected " + exp);
1341         }
1342     }
1343 
1344 #if 0
1345     /*
1346      * Unused test code.
1347      * This was used to compare the old implementation (using USET_CASE)
1348      * with the new one (using 0x100 temporarily)
1349      * while transitioning from hardcoded case closure tables in uniset.cpp
1350      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1351      * and using ucase.c functions for closure.
1352      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1353      *
1354      * Note: The old and new implementation never fully matched because
1355      * the old implementation turned out to not map U+0130 and U+0131 correctly
1356      * (dotted I and dotless i) and because the old implementation's data tables
1357      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1358      * new implementation. (So sigmas and some other characters were not handled
1359      * according to the newer Unicode version.)
1360      */
1361     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1362     UnicodeSetIterator si(sens);
1363     UnicodeString str, buf2;
1364     const UnicodeString *pStr;
1365     UChar32 c;
1366     while(si.next()) {
1367         if(!si.isString()) {
1368             c=si.getCodepoint();
1369             s.clear();
1370             s.add(c);
1371 
1372             str.setTo(c);
1373             str.foldCase();
1374             sens2.add(str);
1375 
1376             t=s;
1377             s.closeOver(USET_CASE);
1378             t.closeOver(0x100);
1379             if(s!=t) {
1380                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1381                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1382             }
1383         }
1384     }
1385     // remove all code points
1386     // should contain all full case folding mapping strings
1387     sens2.remove(0, 0x10ffff);
1388     si.reset(sens2);
1389     while(si.next()) {
1390         if(si.isString()) {
1391             pStr=&si.getString();
1392             s.clear();
1393             s.add(*pStr);
1394             t=s2=s;
1395             s.closeOver(USET_CASE);
1396             t.closeOver(0x100);
1397             if(s!=t) {
1398                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1399                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1400             }
1401         }
1402     }
1403 #endif
1404 
1405     // Test the pattern API
1406     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1407     if (U_FAILURE(ec)) {
1408         errln("FAIL: applyPattern failed");
1409     } else {
1410         expectContainment(s, "abcABC", "defDEF");
1411     }
1412     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1413     if (U_FAILURE(ec)) {
1414         errln("FAIL: constructor failed");
1415     } else {
1416         expectContainment(v, "defDEF", "abcABC");
1417     }
1418     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1419     if (U_FAILURE(ec)) {
1420         errln("FAIL: construct w/case mappings failed");
1421     } else {
1422         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1423     }
1424 }
1425 
TestEscapePattern()1426 void UnicodeSetTest::TestEscapePattern() {
1427     const char pattern[] =
1428         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1429     const char exp[] =
1430         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1431     // We test this with two passes; in the second pass we
1432     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1433     // this fails -- which is what we expect.
1434     for (int32_t pass=1; pass<=2; ++pass) {
1435         UErrorCode ec = U_ZERO_ERROR;
1436         UnicodeString pat(pattern, -1, US_INV);
1437         if (pass==2) {
1438             pat = pat.unescape();
1439         }
1440         // Pattern is only good for pass 1
1441         UBool isPatternValid = (pass==1);
1442 
1443         UnicodeSet set(pat, ec);
1444         if (U_SUCCESS(ec) != isPatternValid){
1445             errln((UnicodeString)"FAIL: applyPattern(" +
1446                   escape(pat) + ") => " +
1447                   u_errorName(ec));
1448             continue;
1449         }
1450         if (U_FAILURE(ec)) {
1451             continue;
1452         }
1453         if (set.contains(u'\u0644')){
1454             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1455         }
1456 
1457         UnicodeString newpat;
1458         set.toPattern(newpat, TRUE);
1459         if (newpat == UnicodeString(exp, -1, US_INV)) {
1460             logln(escape(pat) + " => " + newpat);
1461         } else {
1462             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1463         }
1464 
1465         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1466             UnicodeString str("Range ");
1467             str.append((UChar)(u'0' + i))
1468                 .append(": ")
1469                 .append((UChar32)set.getRangeStart(i))
1470                 .append(" - ")
1471                 .append((UChar32)set.getRangeEnd(i));
1472             str = str + " (" + set.getRangeStart(i) + " - " +
1473                 set.getRangeEnd(i) + ")";
1474             if (set.getRangeStart(i) < 0) {
1475                 errln((UnicodeString)"FAIL: " + escape(str));
1476             } else {
1477                 logln(escape(str));
1478             }
1479         }
1480     }
1481 }
1482 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1483 void UnicodeSetTest::expectRange(const UnicodeString& label,
1484                                  const UnicodeSet& set,
1485                                  UChar32 start, UChar32 end) {
1486     UnicodeSet exp(start, end);
1487     UnicodeString pat;
1488     if (set == exp) {
1489         logln(label + " => " + set.toPattern(pat, TRUE));
1490     } else {
1491         UnicodeString xpat;
1492         errln((UnicodeString)"FAIL: " + label + " => " +
1493               set.toPattern(pat, TRUE) +
1494               ", expected " + exp.toPattern(xpat, TRUE));
1495     }
1496 }
1497 
TestInvalidCodePoint()1498 void UnicodeSetTest::TestInvalidCodePoint() {
1499 
1500     const UChar32 DATA[] = {
1501         // Test range             Expected range
1502         0, 0x10FFFF,              0, 0x10FFFF,
1503         (UChar32)-1, 8,           0, 8,
1504         8, 0x110000,              8, 0x10FFFF
1505     };
1506     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1507 
1508     UnicodeString pat;
1509     int32_t i;
1510 
1511     for (i=0; i<DATA_LENGTH; i+=4) {
1512         UChar32 start  = DATA[i];
1513         UChar32 end    = DATA[i+1];
1514         UChar32 xstart = DATA[i+2];
1515         UChar32 xend   = DATA[i+3];
1516 
1517         // Try various API using the test code points
1518 
1519         UnicodeSet set(start, end);
1520         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1521                     set, xstart, xend);
1522 
1523         set.clear();
1524         set.set(start, end);
1525         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1526                     set, xstart, xend);
1527 
1528         UBool b = set.contains(start);
1529         b = set.contains(start, end);
1530         b = set.containsNone(start, end);
1531         b = set.containsSome(start, end);
1532         (void)b;   // Suppress set but not used warning.
1533 
1534         /*int32_t index = set.indexOf(start);*/
1535 
1536         set.clear();
1537         set.add(start);
1538         set.add(start, end);
1539         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1540                     set, xstart, xend);
1541 
1542         set.set(0, 0x10FFFF);
1543         set.retain(start, end);
1544         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1545                     set, xstart, xend);
1546         set.retain(start);
1547 
1548         set.set(0, 0x10FFFF);
1549         set.remove(start);
1550         set.remove(start, end);
1551         set.complement();
1552         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1553                     set, xstart, xend);
1554 
1555         set.set(0, 0x10FFFF);
1556         set.complement(start, end);
1557         set.complement();
1558         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1559                     set, xstart, xend);
1560         set.complement(start);
1561     }
1562 
1563     const UChar32 DATA2[] = {
1564         0,
1565         0x10FFFF,
1566         (UChar32)-1,
1567         0x110000
1568     };
1569     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1570 
1571     for (i=0; i<DATA2_LENGTH; ++i) {
1572         UChar32 c = DATA2[i], end = 0x10FFFF;
1573         UBool valid = (c >= 0 && c <= 0x10FFFF);
1574 
1575         UnicodeSet set(0, 0x10FFFF);
1576 
1577         // For single-codepoint contains, invalid codepoints are NOT contained
1578         UBool b = set.contains(c);
1579         if (b == valid) {
1580             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1581                   ") = " + b);
1582         } else {
1583             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1584                   ") = " + b);
1585         }
1586 
1587         // For codepoint range contains, containsNone, and containsSome,
1588         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1589         b = set.contains(c, end);
1590         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1591               "," + end + ") = " + b);
1592 
1593         b = set.containsNone(c, end);
1594         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1595               "," + end + ") = " + b);
1596 
1597         b = set.containsSome(c, end);
1598         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1599               "," + end + ") = " + b);
1600 
1601         int32_t index = set.indexOf(c);
1602         if ((index >= 0) == valid) {
1603             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1604                   ") = " + index);
1605         } else {
1606             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1607                   ") = " + index);
1608         }
1609     }
1610 }
1611 
1612 // Used by TestSymbolTable
1613 class TokenSymbolTable : public SymbolTable {
1614 public:
1615     Hashtable contents;
1616 
TokenSymbolTable(UErrorCode & ec)1617     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1618         contents.setValueDeleter(uprv_deleteUObject);
1619     }
1620 
~TokenSymbolTable()1621     ~TokenSymbolTable() {}
1622 
1623     /**
1624      * (Non-SymbolTable API) Add the given variable and value to
1625      * the table.  Variable should NOT contain leading '$'.
1626      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1627     void add(const UnicodeString& var, const UnicodeString& value,
1628              UErrorCode& ec) {
1629         if (U_SUCCESS(ec)) {
1630             contents.put(var, new UnicodeString(value), ec);
1631         }
1632     }
1633 
1634     /**
1635      * SymbolTable API
1636      */
lookup(const UnicodeString & s) const1637     virtual const UnicodeString* lookup(const UnicodeString& s) const override {
1638         return (const UnicodeString*) contents.get(s);
1639     }
1640 
1641     /**
1642      * SymbolTable API
1643      */
lookupMatcher(UChar32) const1644     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const override {
1645         return NULL;
1646     }
1647 
1648     /**
1649      * SymbolTable API
1650      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1651     virtual UnicodeString parseReference(const UnicodeString& text,
1652                                          ParsePosition& pos, int32_t limit) const override {
1653         int32_t start = pos.getIndex();
1654         int32_t i = start;
1655         UnicodeString result;
1656         while (i < limit) {
1657             UChar c = text.charAt(i);
1658             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1659                 break;
1660             }
1661             ++i;
1662         }
1663         if (i == start) { // No valid name chars
1664             return result; // Indicate failure with empty string
1665         }
1666         pos.setIndex(i);
1667         text.extractBetween(start, i, result);
1668         return result;
1669     }
1670 };
1671 
TestSymbolTable()1672 void UnicodeSetTest::TestSymbolTable() {
1673     // Multiple test cases can be set up here.  Each test case
1674     // is terminated by null:
1675     // var, value, var, value,..., input pat., exp. output pat., null
1676     const char* DATA[] = {
1677         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1678         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1679         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1680         NULL
1681     };
1682 
1683     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1684         UErrorCode ec = U_ZERO_ERROR;
1685         TokenSymbolTable sym(ec);
1686         if (U_FAILURE(ec)) {
1687             errln("FAIL: couldn't construct TokenSymbolTable");
1688             continue;
1689         }
1690 
1691         // Set up variables
1692         while (DATA[i+2] != NULL) {
1693             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1694             if (U_FAILURE(ec)) {
1695                 errln("FAIL: couldn't add to TokenSymbolTable");
1696                 continue;
1697             }
1698             i += 2;
1699         }
1700 
1701         // Input pattern and expected output pattern
1702         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1703         i += 2;
1704 
1705         ParsePosition pos(0);
1706         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1707         if (U_FAILURE(ec)) {
1708             errln("FAIL: couldn't construct UnicodeSet");
1709             continue;
1710         }
1711 
1712         // results
1713         if (pos.getIndex() != inpat.length()) {
1714             errln((UnicodeString)"Failed to read to end of string \""
1715                   + inpat + "\": read to "
1716                   + pos.getIndex() + ", length is "
1717                   + inpat.length());
1718         }
1719 
1720         UnicodeSet us2(exppat, ec);
1721         if (U_FAILURE(ec)) {
1722             errln("FAIL: couldn't construct expected UnicodeSet");
1723             continue;
1724         }
1725 
1726         UnicodeString a, b;
1727         if (us != us2) {
1728             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1729                   ", expected " + us2.toPattern(b, TRUE));
1730         } else {
1731             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1732         }
1733     }
1734 }
1735 
TestSurrogate()1736 void UnicodeSetTest::TestSurrogate() {
1737     const char* DATA[] = {
1738         // These should all behave identically
1739         "[abc\\uD800\\uDC00]",
1740         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1741         "[abc\\U00010000]",
1742         0
1743     };
1744     for (int i=0; DATA[i] != 0; ++i) {
1745         UErrorCode ec = U_ZERO_ERROR;
1746         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1747         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1748         UnicodeSet set(str, ec);
1749         if (U_FAILURE(ec)) {
1750             errln("FAIL: UnicodeSet constructor");
1751             continue;
1752         }
1753         expectContainment(set,
1754                           CharsToUnicodeString("abc\\U00010000"),
1755                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1756         if (set.size() != 4) {
1757             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1758                   set.size() + ", expected 4");
1759         }
1760 
1761         {
1762           UErrorCode subErr = U_ZERO_ERROR;
1763           checkRoundTrip(set);
1764           checkSerializeRoundTrip(set, subErr);
1765         }
1766     }
1767 }
1768 
TestExhaustive()1769 void UnicodeSetTest::TestExhaustive() {
1770     // exhaustive tests. Simulate UnicodeSets with integers.
1771     // That gives us very solid tests (except for large memory tests).
1772 
1773     int32_t limit = 128;
1774 
1775     UnicodeSet x, y, z, aa;
1776 
1777     for (int32_t i = 0; i < limit; ++i) {
1778         bitsToSet(i, x);
1779         logln((UnicodeString)"Testing " + i + ", " + x);
1780         _testComplement(i, x, y);
1781 
1782         UnicodeSet &toTest = bitsToSet(i, aa);
1783 
1784         // AS LONG AS WE ARE HERE, check roundtrip
1785         checkRoundTrip(toTest);
1786         UErrorCode ec = U_ZERO_ERROR;
1787         checkSerializeRoundTrip(toTest, ec);
1788 
1789         for (int32_t j = 0; j < limit; ++j) {
1790             _testAdd(i,j,  x,y,z);
1791             _testXor(i,j,  x,y,z);
1792             _testRetain(i,j,  x,y,z);
1793             _testRemove(i,j,  x,y,z);
1794         }
1795     }
1796 }
1797 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1798 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1799     bitsToSet(a, x);
1800     z = x;
1801     z.complement();
1802     int32_t c = setToBits(z);
1803     if (c != (~a)) {
1804         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1805         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1806     }
1807     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1808 }
1809 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1810 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1811     bitsToSet(a, x);
1812     bitsToSet(b, y);
1813     z = x;
1814     z.addAll(y);
1815     int32_t c = setToBits(z);
1816     if (c != (a | b)) {
1817         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1818         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1819     }
1820     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1821 }
1822 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1823 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1824     bitsToSet(a, x);
1825     bitsToSet(b, y);
1826     z = x;
1827     z.retainAll(y);
1828     int32_t c = setToBits(z);
1829     if (c != (a & b)) {
1830         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1831         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1832     }
1833     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1834 }
1835 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1836 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1837     bitsToSet(a, x);
1838     bitsToSet(b, y);
1839     z = x;
1840     z.removeAll(y);
1841     int32_t c = setToBits(z);
1842     if (c != (a &~ b)) {
1843         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1844         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1845     }
1846     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1847 }
1848 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1849 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1850     bitsToSet(a, x);
1851     bitsToSet(b, y);
1852     z = x;
1853     z.complementAll(y);
1854     int32_t c = setToBits(z);
1855     if (c != (a ^ b)) {
1856         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1857         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1858     }
1859     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1860 }
1861 
1862 /**
1863  * Check that ranges are monotonically increasing and non-
1864  * overlapping.
1865  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1866 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1867     int32_t n = set.getRangeCount();
1868     if (n < 0) {
1869         errln((UnicodeString)"FAIL result of " + msg +
1870               ": range count should be >= 0 but is " +
1871               n /*+ " for " + set.toPattern())*/);
1872         return;
1873     }
1874     UChar32 last = 0;
1875     for (int32_t i=0; i<n; ++i) {
1876         UChar32 start = set.getRangeStart(i);
1877         UChar32 end = set.getRangeEnd(i);
1878         if (start > end) {
1879             errln((UnicodeString)"FAIL result of " + msg +
1880                   ": range " + (i+1) +
1881                   " start > end: " + (int)start + ", " + (int)end +
1882                   " for " + set);
1883         }
1884         if (i > 0 && start <= last) {
1885             errln((UnicodeString)"FAIL result of " + msg +
1886                   ": range " + (i+1) +
1887                   " overlaps previous range: " + (int)start + ", " + (int)end +
1888                   " for " + set);
1889         }
1890         last = end;
1891     }
1892 }
1893 
1894 /**
1895  * Convert a bitmask to a UnicodeSet.
1896  */
bitsToSet(int32_t a,UnicodeSet & result)1897 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1898     result.clear();
1899     for (UChar32 i = 0; i < 32; ++i) {
1900         if ((a & (1<<i)) != 0) {
1901             result.add(i);
1902         }
1903     }
1904     return result;
1905 }
1906 
1907 /**
1908  * Convert a UnicodeSet to a bitmask.  Only the characters
1909  * U+0000 to U+0020 are represented in the bitmask.
1910  */
setToBits(const UnicodeSet & x)1911 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1912     int32_t result = 0;
1913     for (int32_t i = 0; i < 32; ++i) {
1914         if (x.contains((UChar32)i)) {
1915             result |= (1<<i);
1916         }
1917     }
1918     return result;
1919 }
1920 
1921 /**
1922  * Return the representation of an inversion list based UnicodeSet
1923  * as a pairs list.  Ranges are listed in ascending Unicode order.
1924  * For example, the set [a-zA-M3] is represented as "33AMaz".
1925  */
getPairs(const UnicodeSet & set)1926 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1927     UnicodeString pairs;
1928     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1929         UChar32 start = set.getRangeStart(i);
1930         UChar32 end = set.getRangeEnd(i);
1931         if (end > 0xFFFF) {
1932             end = 0xFFFF;
1933             i = set.getRangeCount(); // Should be unnecessary
1934         }
1935         pairs.append((UChar)start).append((UChar)end);
1936     }
1937     return pairs;
1938 }
1939 
1940 /**
1941  * Basic consistency check for a few items.
1942  * That the iterator works, and that we can create a pattern and
1943  * get the same thing back
1944  */
checkRoundTrip(const UnicodeSet & s)1945 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1946     {
1947         UnicodeSet t(s);
1948         checkEqual(s, t, "copy ct");
1949     }
1950 
1951     {
1952         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1953         t = s;
1954         checkEqual(s, t, "operator=");
1955     }
1956 
1957     {
1958         UnicodeSet t;
1959         copyWithIterator(t, s, FALSE);
1960         checkEqual(s, t, "iterator roundtrip");
1961     }
1962 
1963     {
1964         UnicodeSet t;
1965         copyWithIterator(t, s, TRUE); // try range
1966         checkEqual(s, t, "iterator roundtrip");
1967     }
1968 
1969     {
1970         UnicodeSet t;
1971         UnicodeString pat;
1972         UErrorCode ec = U_ZERO_ERROR;
1973         s.toPattern(pat, FALSE);
1974         t.applyPattern(pat, ec);
1975         if (U_FAILURE(ec)) {
1976             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1977             return;
1978         } else {
1979             checkEqual(s, t, "toPattern(false)");
1980         }
1981     }
1982 
1983     {
1984         UnicodeSet t;
1985         UnicodeString pat;
1986         UErrorCode ec = U_ZERO_ERROR;
1987         s.toPattern(pat, TRUE);
1988         t.applyPattern(pat, ec);
1989         if (U_FAILURE(ec)) {
1990             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1991             return;
1992         } else {
1993             checkEqual(s, t, "toPattern(true)");
1994         }
1995     }
1996 }
1997 
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1998 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1999   if(U_FAILURE(status)) return;
2000   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2001   if(status == U_BUFFER_OVERFLOW_ERROR) {
2002     status = U_ZERO_ERROR;
2003     serializeBuffer.resize(len);
2004     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2005     // let 2nd error stand
2006   }
2007   if(U_FAILURE(status)) {
2008     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
2009     return;
2010   }
2011   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
2012   if(U_FAILURE(status)) {
2013     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
2014     return;
2015   }
2016 
2017   checkEqual(t, deserialized, "Set was unequal when deserialized");
2018 }
2019 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)2020 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
2021     t.clear();
2022     UnicodeSetIterator it(s);
2023     if (withRange) {
2024         while (it.nextRange()) {
2025             if (it.isString()) {
2026                 t.add(it.getString());
2027             } else {
2028                 t.add(it.getCodepoint(), it.getCodepointEnd());
2029             }
2030         }
2031     } else {
2032         while (it.next()) {
2033             if (it.isString()) {
2034                 t.add(it.getString());
2035             } else {
2036                 t.add(it.getCodepoint());
2037             }
2038         }
2039     }
2040 }
2041 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2042 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2043   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2044   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2045     UnicodeString source; s.toPattern(source, TRUE);
2046     UnicodeString result; t.toPattern(result, TRUE);
2047     if (s != t) {
2048         errln((UnicodeString)"FAIL: " + message
2049               + "; source = " + source
2050               + "; result = " + result
2051               );
2052         return FALSE;
2053     } else {
2054         logln((UnicodeString)"Ok: " + message
2055               + "; source = " + source
2056               + "; result = " + result
2057               );
2058     }
2059     return TRUE;
2060 }
2061 
2062 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2063 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2064                                   const UnicodeString& charsIn,
2065                                   const UnicodeString& charsOut) {
2066     UErrorCode ec = U_ZERO_ERROR;
2067     UnicodeSet set(pat, ec);
2068     if (U_FAILURE(ec)) {
2069         dataerrln((UnicodeString)"FAIL: pattern \"" +
2070               pat + "\" => " + u_errorName(ec));
2071         return;
2072     }
2073     expectContainment(set, pat, charsIn, charsOut);
2074 }
2075 
2076 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2077 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2078                                   const UnicodeString& charsIn,
2079                                   const UnicodeString& charsOut) {
2080     UnicodeString pat;
2081     set.toPattern(pat);
2082     expectContainment(set, pat, charsIn, charsOut);
2083 }
2084 
2085 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2086 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2087                                   const UnicodeString& setName,
2088                                   const UnicodeString& charsIn,
2089                                   const UnicodeString& charsOut) {
2090     UnicodeString bad;
2091     UChar32 c;
2092     int32_t i;
2093 
2094     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2095         c = charsIn.char32At(i);
2096         if (!set.contains(c)) {
2097             bad.append(c);
2098         }
2099     }
2100     if (bad.length() > 0) {
2101         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2102               ", expected containment of " + prettify(charsIn));
2103     } else {
2104         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2105     }
2106 
2107     bad.truncate(0);
2108     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2109         c = charsOut.char32At(i);
2110         if (set.contains(c)) {
2111             bad.append(c);
2112         }
2113     }
2114     if (bad.length() > 0) {
2115         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2116               ", expected non-containment of " + prettify(charsOut));
2117     } else {
2118         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2119     }
2120 }
2121 
2122 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2123 UnicodeSetTest::expectPattern(UnicodeSet& set,
2124                               const UnicodeString& pattern,
2125                               const UnicodeString& expectedPairs){
2126     UErrorCode status = U_ZERO_ERROR;
2127     set.applyPattern(pattern, status);
2128     if (U_FAILURE(status)) {
2129         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2130               "\") failed");
2131         return;
2132     } else {
2133         if (getPairs(set) != expectedPairs ) {
2134             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2135                   "\") => pairs \"" +
2136                   escape(getPairs(set)) + "\", expected \"" +
2137                   escape(expectedPairs) + "\"");
2138         } else {
2139             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2140                   "\") => pairs \"" +
2141                   escape(getPairs(set)) + "\"");
2142         }
2143     }
2144     // the result of calling set.toPattern(), which is the string representation of
2145     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2146     // will produce another set that is equal to this one.
2147     UnicodeString temppattern;
2148     set.toPattern(temppattern);
2149     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2150     if (U_FAILURE(status)) {
2151         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2152         return;
2153     }
2154     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2155         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2156             escape(getPairs(set)) + "\""));
2157     } else{
2158         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2159     }
2160 
2161     delete tempset;
2162 
2163 }
2164 
2165 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2166 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2167     if (getPairs(set) != expectedPairs) {
2168         errln(UnicodeString("FAIL: Expected pair list \"") +
2169               escape(expectedPairs) + "\", got \"" +
2170               escape(getPairs(set)) + "\"");
2171     }
2172 }
2173 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2174 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2175                                      const UnicodeString& expPat,
2176                                      const char** expStrings) {
2177     UnicodeString pat;
2178     set.toPattern(pat, TRUE);
2179     if (pat == expPat) {
2180         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2181     } else {
2182         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2183         return;
2184     }
2185     if (expStrings == NULL) {
2186         return;
2187     }
2188     UBool in = TRUE;
2189     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2190         if (expStrings[i] == NOT) { // sic; pointer comparison
2191             in = FALSE;
2192             continue;
2193         }
2194         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2195         UBool contained = set.contains(s);
2196         if (contained == in) {
2197             logln((UnicodeString)"Ok: " + expPat +
2198                   (contained ? " contains {" : " does not contain {") +
2199                   escape(expStrings[i]) + "}");
2200         } else {
2201             errln((UnicodeString)"FAIL: " + expPat +
2202                   (contained ? " contains {" : " does not contain {") +
2203                   escape(expStrings[i]) + "}");
2204         }
2205     }
2206 }
2207 
toHexString(int32_t i)2208 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? u'0' : (u'A' - 10))); }
2209 
2210 void
doAssert(UBool condition,const char * message)2211 UnicodeSetTest::doAssert(UBool condition, const char *message)
2212 {
2213     if (!condition) {
2214         errln(UnicodeString("ERROR : ") + message);
2215     }
2216 }
2217 
2218 UnicodeString
escape(const UnicodeString & s)2219 UnicodeSetTest::escape(const UnicodeString& s) {
2220     UnicodeString buf;
2221     for (int32_t i=0; i<s.length(); )
2222     {
2223         UChar32 c = s.char32At(i);
2224         if (0x0020 <= c && c <= 0x007F) {
2225             buf += c;
2226         } else {
2227             if (c <= 0xFFFF) {
2228                 buf += u"\\u";
2229             } else {
2230                 buf += u"\\U";
2231                 buf += toHexString((c & 0xF0000000) >> 28);
2232                 buf += toHexString((c & 0x0F000000) >> 24);
2233                 buf += toHexString((c & 0x00F00000) >> 20);
2234                 buf += toHexString((c & 0x000F0000) >> 16);
2235             }
2236             buf += toHexString((c & 0xF000) >> 12);
2237             buf += toHexString((c & 0x0F00) >> 8);
2238             buf += toHexString((c & 0x00F0) >> 4);
2239             buf += toHexString(c & 0x000F);
2240         }
2241         i += U16_LENGTH(c);
2242     }
2243     return buf;
2244 }
2245 
TestFreezable()2246 void UnicodeSetTest::TestFreezable() {
2247     UErrorCode errorCode=U_ZERO_ERROR;
2248     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2249     UnicodeSet idSet(idPattern, errorCode);
2250     if(U_FAILURE(errorCode)) {
2251         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2252         return;
2253     }
2254 
2255     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2256     UnicodeSet wsSet(wsPattern, errorCode);
2257     if(U_FAILURE(errorCode)) {
2258         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2259         return;
2260     }
2261 
2262     idSet.add(idPattern);
2263     UnicodeSet frozen(idSet);
2264     frozen.freeze();
2265 
2266     if(idSet.isFrozen() || !frozen.isFrozen()) {
2267         errln("FAIL: isFrozen() is wrong");
2268     }
2269     if(frozen!=idSet || !(frozen==idSet)) {
2270         errln("FAIL: a copy-constructed frozen set differs from its original");
2271     }
2272 
2273     frozen=wsSet;
2274     if(frozen!=idSet || !(frozen==idSet)) {
2275         errln("FAIL: a frozen set was modified by operator=");
2276     }
2277 
2278     UnicodeSet frozen2(frozen);
2279     if(frozen2!=frozen || frozen2!=idSet) {
2280         errln("FAIL: a copied frozen set differs from its frozen original");
2281     }
2282     if(!frozen2.isFrozen()) {
2283         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2284     }
2285     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2286     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2287         errln("FAIL: UnicodeSet(5, 55) failed");
2288     }
2289     frozen3=frozen;
2290     if(!frozen3.isFrozen()) {
2291         errln("FAIL: copying a frozen set results in a thawed one");
2292     }
2293 
2294     UnicodeSet *cloned=frozen.clone();
2295     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2296         errln("FAIL: clone() failed");
2297     }
2298     cloned->add(0xd802, 0xd805);
2299     if(cloned->containsSome(0xd802, 0xd805)) {
2300         errln("FAIL: unable to modify clone");
2301     }
2302     delete cloned;
2303 
2304     UnicodeSet *thawed=frozen.cloneAsThawed();
2305     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2306         errln("FAIL: cloneAsThawed() failed");
2307     }
2308     thawed->add(0xd802, 0xd805);
2309     if(!thawed->contains(0xd802, 0xd805)) {
2310         errln("FAIL: unable to modify thawed clone");
2311     }
2312     delete thawed;
2313 
2314     frozen.set(5, 55);
2315     if(frozen!=idSet || !(frozen==idSet)) {
2316         errln("FAIL: UnicodeSet::set() modified a frozen set");
2317     }
2318 
2319     frozen.clear();
2320     if(frozen!=idSet || !(frozen==idSet)) {
2321         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2322     }
2323 
2324     frozen.closeOver(USET_CASE_INSENSITIVE);
2325     if(frozen!=idSet || !(frozen==idSet)) {
2326         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2327     }
2328 
2329     frozen.compact();
2330     if(frozen!=idSet || !(frozen==idSet)) {
2331         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2332     }
2333 
2334     ParsePosition pos;
2335     frozen.
2336         applyPattern(wsPattern, errorCode).
2337         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2338         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2339         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2340         applyPropertyAlias(u"Assigned", UnicodeString(), errorCode);
2341     if(frozen!=idSet || !(frozen==idSet)) {
2342         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2343     }
2344 
2345     frozen.
2346         add(0xd800).
2347         add(0xd802, 0xd805).
2348         add(wsPattern).
2349         addAll(idPattern).
2350         addAll(wsSet);
2351     if(frozen!=idSet || !(frozen==idSet)) {
2352         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2353     }
2354 
2355     frozen.
2356         retain(0x62).
2357         retain(0x64, 0x69).
2358         retainAll(wsPattern).
2359         retainAll(wsSet);
2360     if(frozen!=idSet || !(frozen==idSet)) {
2361         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2362     }
2363 
2364     frozen.
2365         remove(0x62).
2366         remove(0x64, 0x69).
2367         remove(idPattern).
2368         removeAll(idPattern).
2369         removeAll(idSet);
2370     if(frozen!=idSet || !(frozen==idSet)) {
2371         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2372     }
2373 
2374     frozen.
2375         complement().
2376         complement(0x62).
2377         complement(0x64, 0x69).
2378         complement(idPattern).
2379         complementAll(idPattern).
2380         complementAll(idSet);
2381     if(frozen!=idSet || !(frozen==idSet)) {
2382         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2383     }
2384 }
2385 
2386 // Test span() etc. -------------------------------------------------------- ***
2387 
2388 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2389 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2390 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2391     UErrorCode errorCode=U_ZERO_ERROR;
2392     int32_t length8=0;
2393     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2394     if(U_SUCCESS(errorCode)) {
2395         return length8;
2396     } else {
2397         // The string contains an unpaired surrogate.
2398         // Ignore this string.
2399         return 0;
2400     }
2401 }
2402 
2403 class UnicodeSetWithStringsIterator;
2404 
2405 // Make the strings in a UnicodeSet easily accessible.
2406 class UnicodeSetWithStrings {
2407 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2408     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2409             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2410         int32_t size=set.size();
2411         if(size>0 && set.charAt(size-1)<0) {
2412             // If a set's last element is not a code point, then it must contain strings.
2413             // Iterate over the set, skip all code point ranges, and cache the strings.
2414             // Convert them to UTF-8 for spanUTF8().
2415             UnicodeSetIterator iter(set);
2416             const UnicodeString *s;
2417             char *s8=utf8;
2418             int32_t length8, utf8Count=0;
2419             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2420                 if(iter.isString()) {
2421                     // Store the pointer to the set's string element
2422                     // which we happen to know is a stable pointer.
2423                     strings[stringsLength]=s=&iter.getString();
2424                     utf8Count+=
2425                         utf8Lengths[stringsLength]=length8=
2426                         appendUTF8(s->getBuffer(), s->length(),
2427                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2428                     if(length8==0) {
2429                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2430                     }
2431                     s8+=length8;
2432                     ++stringsLength;
2433                 }
2434             }
2435         }
2436     }
2437 
getSet() const2438     const UnicodeSet &getSet() const {
2439         return set;
2440     }
2441 
hasStrings() const2442     UBool hasStrings() const {
2443         return (UBool)(stringsLength>0);
2444     }
2445 
hasStringsWithSurrogates() const2446     UBool hasStringsWithSurrogates() const {
2447         return hasSurrogates;
2448     }
2449 
2450 private:
2451     friend class UnicodeSetWithStringsIterator;
2452 
2453     const UnicodeSet &set;
2454 
2455     const UnicodeString *strings[20];
2456     int32_t stringsLength;
2457     UBool hasSurrogates;
2458 
2459     char utf8[1024];
2460     int32_t utf8Lengths[20];
2461 };
2462 
2463 class UnicodeSetWithStringsIterator {
2464 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2465     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2466             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2467     }
2468 
reset()2469     void reset() {
2470         nextStringIndex=nextUTF8Start=0;
2471     }
2472 
nextString()2473     const UnicodeString *nextString() {
2474         if(nextStringIndex<fSet.stringsLength) {
2475             return fSet.strings[nextStringIndex++];
2476         } else {
2477             return NULL;
2478         }
2479     }
2480 
2481     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2482     const char *nextUTF8(int32_t &length) {
2483         if(nextStringIndex<fSet.stringsLength) {
2484             const char *s8=fSet.utf8+nextUTF8Start;
2485             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2486             return s8;
2487         } else {
2488             length=0;
2489             return NULL;
2490         }
2491     }
2492 
2493 private:
2494     const UnicodeSetWithStrings &fSet;
2495     int32_t nextStringIndex;
2496     int32_t nextUTF8Start;
2497 };
2498 
2499 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2500 // at code point boundaries.
2501 // That is, each edge of a match must not be in the middle of a surrogate pair.
2502 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2503 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2504     s+=start;
2505     limit-=start;
2506     int32_t length=t.length();
2507     return 0==t.compare(s, length) &&
2508            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2509            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2510 }
2511 
2512 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2513 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2514                                  USetSpanCondition spanCondition) {
2515     const UnicodeSet &realSet(set.getSet());
2516     if(!set.hasStrings()) {
2517         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2518             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2519         }
2520 
2521         UChar32 c;
2522         int32_t start=0, prev;
2523         while((prev=start)<length) {
2524             U16_NEXT(s, start, length, c);
2525             if(realSet.contains(c)!=spanCondition) {
2526                 break;
2527             }
2528         }
2529         return prev;
2530     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2531         UnicodeSetWithStringsIterator iter(set);
2532         UChar32 c;
2533         int32_t start, next;
2534         for(start=next=0; start<length;) {
2535             U16_NEXT(s, next, length, c);
2536             if(realSet.contains(c)) {
2537                 break;
2538             }
2539             const UnicodeString *str;
2540             iter.reset();
2541             while((str=iter.nextString())!=NULL) {
2542                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2543                     // spanNeedsStrings=TRUE;
2544                     return start;
2545                 }
2546             }
2547             start=next;
2548         }
2549         return start;
2550     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2551         UnicodeSetWithStringsIterator iter(set);
2552         UChar32 c;
2553         int32_t start, next, maxSpanLimit=0;
2554         for(start=next=0; start<length;) {
2555             U16_NEXT(s, next, length, c);
2556             if(!realSet.contains(c)) {
2557                 next=start;  // Do not span this single, not-contained code point.
2558             }
2559             const UnicodeString *str;
2560             iter.reset();
2561             while((str=iter.nextString())!=NULL) {
2562                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2563                     // spanNeedsStrings=TRUE;
2564                     int32_t matchLimit=start+str->length();
2565                     if(matchLimit==length) {
2566                         return length;
2567                     }
2568                     if(spanCondition==USET_SPAN_CONTAINED) {
2569                         // Iterate for the shortest match at each position.
2570                         // Recurse for each but the shortest match.
2571                         if(next==start) {
2572                             next=matchLimit;  // First match from start.
2573                         } else {
2574                             if(matchLimit<next) {
2575                                 // Remember shortest match from start for iteration.
2576                                 int32_t temp=next;
2577                                 next=matchLimit;
2578                                 matchLimit=temp;
2579                             }
2580                             // Recurse for non-shortest match from start.
2581                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2582                                                                  USET_SPAN_CONTAINED);
2583                             if((matchLimit+spanLength)>maxSpanLimit) {
2584                                 maxSpanLimit=matchLimit+spanLength;
2585                                 if(maxSpanLimit==length) {
2586                                     return length;
2587                                 }
2588                             }
2589                         }
2590                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2591                         if(matchLimit>next) {
2592                             // Remember longest match from start.
2593                             next=matchLimit;
2594                         }
2595                     }
2596                 }
2597             }
2598             if(next==start) {
2599                 break;  // No match from start.
2600             }
2601             start=next;
2602         }
2603         if(start>maxSpanLimit) {
2604             return start;
2605         } else {
2606             return maxSpanLimit;
2607         }
2608     }
2609 }
2610 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2611 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2612                                      USetSpanCondition spanCondition) {
2613     if(length==0) {
2614         return 0;
2615     }
2616     const UnicodeSet &realSet(set.getSet());
2617     if(!set.hasStrings()) {
2618         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2619             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2620         }
2621 
2622         UChar32 c;
2623         int32_t prev=length;
2624         do {
2625             U16_PREV(s, 0, length, c);
2626             if(realSet.contains(c)!=spanCondition) {
2627                 break;
2628             }
2629         } while((prev=length)>0);
2630         return prev;
2631     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2632         UnicodeSetWithStringsIterator iter(set);
2633         UChar32 c;
2634         int32_t prev=length, length0=length;
2635         do {
2636             U16_PREV(s, 0, length, c);
2637             if(realSet.contains(c)) {
2638                 break;
2639             }
2640             const UnicodeString *str;
2641             iter.reset();
2642             while((str=iter.nextString())!=NULL) {
2643                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2644                     // spanNeedsStrings=TRUE;
2645                     return prev;
2646                 }
2647             }
2648         } while((prev=length)>0);
2649         return prev;
2650     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2651         UnicodeSetWithStringsIterator iter(set);
2652         UChar32 c;
2653         int32_t prev=length, minSpanStart=length, length0=length;
2654         do {
2655             U16_PREV(s, 0, length, c);
2656             if(!realSet.contains(c)) {
2657                 length=prev;  // Do not span this single, not-contained code point.
2658             }
2659             const UnicodeString *str;
2660             iter.reset();
2661             while((str=iter.nextString())!=NULL) {
2662                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2663                     // spanNeedsStrings=TRUE;
2664                     int32_t matchStart=prev-str->length();
2665                     if(matchStart==0) {
2666                         return 0;
2667                     }
2668                     if(spanCondition==USET_SPAN_CONTAINED) {
2669                         // Iterate for the shortest match at each position.
2670                         // Recurse for each but the shortest match.
2671                         if(length==prev) {
2672                             length=matchStart;  // First match from prev.
2673                         } else {
2674                             if(matchStart>length) {
2675                                 // Remember shortest match from prev for iteration.
2676                                 int32_t temp=length;
2677                                 length=matchStart;
2678                                 matchStart=temp;
2679                             }
2680                             // Recurse for non-shortest match from prev.
2681                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2682                                                                     USET_SPAN_CONTAINED);
2683                             if(spanStart<minSpanStart) {
2684                                 minSpanStart=spanStart;
2685                                 if(minSpanStart==0) {
2686                                     return 0;
2687                                 }
2688                             }
2689                         }
2690                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2691                         if(matchStart<length) {
2692                             // Remember longest match from prev.
2693                             length=matchStart;
2694                         }
2695                     }
2696                 }
2697             }
2698             if(length==prev) {
2699                 break;  // No match from prev.
2700             }
2701         } while((prev=length)>0);
2702         if(prev<minSpanStart) {
2703             return prev;
2704         } else {
2705             return minSpanStart;
2706         }
2707     }
2708 }
2709 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2710 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2711                                 USetSpanCondition spanCondition) {
2712     const UnicodeSet &realSet(set.getSet());
2713     if(!set.hasStrings()) {
2714         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2715             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2716         }
2717 
2718         UChar32 c;
2719         int32_t start=0, prev;
2720         while((prev=start)<length) {
2721             U8_NEXT_OR_FFFD(s, start, length, c);
2722             if(realSet.contains(c)!=spanCondition) {
2723                 break;
2724             }
2725         }
2726         return prev;
2727     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2728         UnicodeSetWithStringsIterator iter(set);
2729         UChar32 c;
2730         int32_t start, next;
2731         for(start=next=0; start<length;) {
2732             U8_NEXT_OR_FFFD(s, next, length, c);
2733             if(realSet.contains(c)) {
2734                 break;
2735             }
2736             const char *s8;
2737             int32_t length8;
2738             iter.reset();
2739             while((s8=iter.nextUTF8(length8))!=NULL) {
2740                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2741                     // spanNeedsStrings=TRUE;
2742                     return start;
2743                 }
2744             }
2745             start=next;
2746         }
2747         return start;
2748     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2749         UnicodeSetWithStringsIterator iter(set);
2750         UChar32 c;
2751         int32_t start, next, maxSpanLimit=0;
2752         for(start=next=0; start<length;) {
2753             U8_NEXT_OR_FFFD(s, next, length, c);
2754             if(!realSet.contains(c)) {
2755                 next=start;  // Do not span this single, not-contained code point.
2756             }
2757             const char *s8;
2758             int32_t length8;
2759             iter.reset();
2760             while((s8=iter.nextUTF8(length8))!=NULL) {
2761                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2762                     // spanNeedsStrings=TRUE;
2763                     int32_t matchLimit=start+length8;
2764                     if(matchLimit==length) {
2765                         return length;
2766                     }
2767                     if(spanCondition==USET_SPAN_CONTAINED) {
2768                         // Iterate for the shortest match at each position.
2769                         // Recurse for each but the shortest match.
2770                         if(next==start) {
2771                             next=matchLimit;  // First match from start.
2772                         } else {
2773                             if(matchLimit<next) {
2774                                 // Remember shortest match from start for iteration.
2775                                 int32_t temp=next;
2776                                 next=matchLimit;
2777                                 matchLimit=temp;
2778                             }
2779                             // Recurse for non-shortest match from start.
2780                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2781                                                                 USET_SPAN_CONTAINED);
2782                             if((matchLimit+spanLength)>maxSpanLimit) {
2783                                 maxSpanLimit=matchLimit+spanLength;
2784                                 if(maxSpanLimit==length) {
2785                                     return length;
2786                                 }
2787                             }
2788                         }
2789                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2790                         if(matchLimit>next) {
2791                             // Remember longest match from start.
2792                             next=matchLimit;
2793                         }
2794                     }
2795                 }
2796             }
2797             if(next==start) {
2798                 break;  // No match from start.
2799             }
2800             start=next;
2801         }
2802         if(start>maxSpanLimit) {
2803             return start;
2804         } else {
2805             return maxSpanLimit;
2806         }
2807     }
2808 }
2809 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2810 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2811                                     USetSpanCondition spanCondition) {
2812     if(length==0) {
2813         return 0;
2814     }
2815     const UnicodeSet &realSet(set.getSet());
2816     if(!set.hasStrings()) {
2817         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2818             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2819         }
2820 
2821         UChar32 c;
2822         int32_t prev=length;
2823         do {
2824             U8_PREV_OR_FFFD(s, 0, length, c);
2825             if(realSet.contains(c)!=spanCondition) {
2826                 break;
2827             }
2828         } while((prev=length)>0);
2829         return prev;
2830     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2831         UnicodeSetWithStringsIterator iter(set);
2832         UChar32 c;
2833         int32_t prev=length;
2834         do {
2835             U8_PREV_OR_FFFD(s, 0, length, c);
2836             if(realSet.contains(c)) {
2837                 break;
2838             }
2839             const char *s8;
2840             int32_t length8;
2841             iter.reset();
2842             while((s8=iter.nextUTF8(length8))!=NULL) {
2843                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2844                     // spanNeedsStrings=TRUE;
2845                     return prev;
2846                 }
2847             }
2848         } while((prev=length)>0);
2849         return prev;
2850     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2851         UnicodeSetWithStringsIterator iter(set);
2852         UChar32 c;
2853         int32_t prev=length, minSpanStart=length;
2854         do {
2855             U8_PREV_OR_FFFD(s, 0, length, c);
2856             if(!realSet.contains(c)) {
2857                 length=prev;  // Do not span this single, not-contained code point.
2858             }
2859             const char *s8;
2860             int32_t length8;
2861             iter.reset();
2862             while((s8=iter.nextUTF8(length8))!=NULL) {
2863                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2864                     // spanNeedsStrings=TRUE;
2865                     int32_t matchStart=prev-length8;
2866                     if(matchStart==0) {
2867                         return 0;
2868                     }
2869                     if(spanCondition==USET_SPAN_CONTAINED) {
2870                         // Iterate for the shortest match at each position.
2871                         // Recurse for each but the shortest match.
2872                         if(length==prev) {
2873                             length=matchStart;  // First match from prev.
2874                         } else {
2875                             if(matchStart>length) {
2876                                 // Remember shortest match from prev for iteration.
2877                                 int32_t temp=length;
2878                                 length=matchStart;
2879                                 matchStart=temp;
2880                             }
2881                             // Recurse for non-shortest match from prev.
2882                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2883                                                                    USET_SPAN_CONTAINED);
2884                             if(spanStart<minSpanStart) {
2885                                 minSpanStart=spanStart;
2886                                 if(minSpanStart==0) {
2887                                     return 0;
2888                                 }
2889                             }
2890                         }
2891                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2892                         if(matchStart<length) {
2893                             // Remember longest match from prev.
2894                             length=matchStart;
2895                         }
2896                     }
2897                 }
2898             }
2899             if(length==prev) {
2900                 break;  // No match from prev.
2901             }
2902         } while((prev=length)>0);
2903         if(prev<minSpanStart) {
2904             return prev;
2905         } else {
2906             return minSpanStart;
2907         }
2908     }
2909 }
2910 
// Bit flags selecting which spans are performed and compared.
// Each group ends with a mask that combines all flags of that group.
enum {
    // text encodings
    SPAN_UTF16          =0x001,
    SPAN_UTF8           =0x002,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // the original set and/or its complement
    SPAN_SET            =0x004,
    SPAN_COMPLEMENT     =0x008,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // span directions
    SPAN_FWD            =0x010,
    SPAN_BACK           =0x020,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span conditions used for the "contained" spans
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2931 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2932 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2933     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2934 }
2935 
slen(const void * s,UBool isUTF16)2936 static inline int32_t slen(const void *s, UBool isUTF16) {
2937     return isUTF16 ? u_strlen((const UChar *)s) : static_cast<int32_t>(strlen((const char *)s));
2938 }
2939 
2940 /*
2941  * Count spans on a string with the method according to type and set the span limits.
2942  * The set may be the complement of the original.
2943  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2944  * according to the expected number of spans.
2945  * Sets typeName to an empty string if there is no such type.
2946  * Returns -1 if the span option is filtered out.
2947  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2948 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2949                         const void *s, int32_t length, UBool isUTF16,
2950                         uint32_t whichSpans,
2951                         int type, const char *&typeName,
2952                         int32_t limits[], int32_t limitsCapacity,
2953                         int32_t expectCount) {
2954     const UnicodeSet &realSet(set.getSet());
2955     int32_t start, count;
2956     USetSpanCondition spanCondition, firstSpanCondition, contained;
2957     UBool isForward;
2958 
2959     if(type<0 || 7<type) {
2960         typeName="";
2961         return 0;
2962     }
2963 
2964     static const char *const typeNames16[]={
2965         "contains", "contains(LM)",
2966         "span", "span(LM)",
2967         "containsBack", "containsBack(LM)",
2968         "spanBack", "spanBack(LM)"
2969     };
2970 
2971     static const char *const typeNames8[]={
2972         "containsUTF8", "containsUTF8(LM)",
2973         "spanUTF8", "spanUTF8(LM)",
2974         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2975         "spanBackUTF8", "spanBackUTF8(LM)"
2976     };
2977 
2978     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2979 
2980     // filter span options
2981     if(type<=3) {
2982         // span forward
2983         if((whichSpans&SPAN_FWD)==0) {
2984             return -1;
2985         }
2986         isForward=TRUE;
2987     } else {
2988         // span backward
2989         if((whichSpans&SPAN_BACK)==0) {
2990             return -1;
2991         }
2992         isForward=FALSE;
2993     }
2994     if((type&1)==0) {
2995         // use USET_SPAN_CONTAINED
2996         if((whichSpans&SPAN_CONTAINED)==0) {
2997             return -1;
2998         }
2999         contained=USET_SPAN_CONTAINED;
3000     } else {
3001         // use USET_SPAN_SIMPLE
3002         if((whichSpans&SPAN_SIMPLE)==0) {
3003             return -1;
3004         }
3005         contained=USET_SPAN_SIMPLE;
3006     }
3007 
3008     // Default first span condition for going forward with an uncomplemented set.
3009     spanCondition=USET_SPAN_NOT_CONTAINED;
3010     if(isComplement) {
3011         spanCondition=invertSpanCondition(spanCondition, contained);
3012     }
3013 
3014     // First span condition for span(), used to terminate the spanBack() iteration.
3015     firstSpanCondition=spanCondition;
3016 
3017     // spanBack(): Its initial span condition is span()'s last span condition,
3018     // which is the opposite of span()'s first span condition
3019     // if we expect an even number of spans.
3020     // (The loop inverts spanCondition (expectCount-1) times
3021     // before the expectCount'th span() call.)
3022     // If we do not compare forward and backward directions, then we do not have an
3023     // expectCount and just start with firstSpanCondition.
3024     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
3025         spanCondition=invertSpanCondition(spanCondition, contained);
3026     }
3027 
3028     count=0;
3029     switch(type) {
3030     case 0:
3031     case 1:
3032         start=0;
3033         if(length<0) {
3034             length=slen(s, isUTF16);
3035         }
3036         for(;;) {
3037             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
3038                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3039             if(count<limitsCapacity) {
3040                 limits[count]=start;
3041             }
3042             ++count;
3043             if(start>=length) {
3044                 break;
3045             }
3046             spanCondition=invertSpanCondition(spanCondition, contained);
3047         }
3048         break;
3049     case 2:
3050     case 3:
3051         start=0;
3052         for(;;) {
3053             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3054                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3055             if(count<limitsCapacity) {
3056                 limits[count]=start;
3057             }
3058             ++count;
3059             if(length>=0 ? start>=length :
3060                            isUTF16 ? ((const UChar *)s)[start]==0 :
3061                                      ((const char *)s)[start]==0
3062             ) {
3063                 break;
3064             }
3065             spanCondition=invertSpanCondition(spanCondition, contained);
3066         }
3067         break;
3068     case 4:
3069     case 5:
3070         if(length<0) {
3071             length=slen(s, isUTF16);
3072         }
3073         for(;;) {
3074             ++count;
3075             if(count<=limitsCapacity) {
3076                 limits[limitsCapacity-count]=length;
3077             }
3078             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3079                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3080             if(length==0 && spanCondition==firstSpanCondition) {
3081                 break;
3082             }
3083             spanCondition=invertSpanCondition(spanCondition, contained);
3084         }
3085         if(count<limitsCapacity) {
3086             memmove(limits, limits+(limitsCapacity-count), count*4);
3087         }
3088         break;
3089     case 6:
3090     case 7:
3091         for(;;) {
3092             ++count;
3093             if(count<=limitsCapacity) {
3094                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3095             }
3096             // Note: Length<0 is tested only for the first spanBack().
3097             // If we wanted to keep length<0 for all spanBack()s, we would have to
3098             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3099             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3100                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3101             if(length==0 && spanCondition==firstSpanCondition) {
3102                 break;
3103             }
3104             spanCondition=invertSpanCondition(spanCondition, contained);
3105         }
3106         if(count<limitsCapacity) {
3107             memmove(limits, limits+(limitsCapacity-count), count*4);
3108         }
3109         break;
3110     default:
3111         typeName="";
3112         return -1;
3113     }
3114 
3115     return count;
3116 }
3117 
// Indexes of the sets to be tested.
// An odd index marks a complemented set (isComplement).
enum {
    SLOW=0,
    SLOW_NOT=1,
    FAST=2,
    FAST_NOT=3,
    SET_COUNT=4
};
3126 
3127 static const char *const setNames[SET_COUNT]={
3128     "slow",
3129     "slow.not",
3130     "fast",
3131     "fast.not"
3132 };
3133 
3134 /*
3135  * Verify that we get the same results whether we look at text with contains(),
3136  * span() or spanBack(), using unfrozen or frozen versions of the set,
3137  * and using the set or its complement (switching the spanConditions accordingly).
3138  * The latter verifies that
3139  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3140  *
3141  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3142  * or returned to the caller (with an input expectCount<0).
3143  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3144 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3145                               const void *s, int32_t length, UBool isUTF16,
3146                               uint32_t whichSpans,
3147                               int32_t expectLimits[], int32_t &expectCount,
3148                               const char *testName, int32_t index) {
3149     int32_t limits[500];
3150     int32_t limitsCount;
3151     int i, j;
3152 
3153     const char *typeName;
3154     int type;
3155 
3156     for(i=0; i<SET_COUNT; ++i) {
3157         if((i&1)==0) {
3158             // Even-numbered sets are original, uncomplemented sets.
3159             if((whichSpans&SPAN_SET)==0) {
3160                 continue;
3161             }
3162         } else {
3163             // Odd-numbered sets are complemented.
3164             if((whichSpans&SPAN_COMPLEMENT)==0) {
3165                 continue;
3166             }
3167         }
3168         for(type=0;; ++type) {
3169             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3170                                  s, length, isUTF16,
3171                                  whichSpans,
3172                                  type, typeName,
3173                                  limits, UPRV_LENGTHOF(limits), expectCount);
3174             if(typeName[0]==0) {
3175                 break; // All types tried.
3176             }
3177             if(limitsCount<0) {
3178                 continue; // Span option filtered out.
3179             }
3180             if(expectCount<0) {
3181                 expectCount=limitsCount;
3182                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3183                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3184                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3185                     return;
3186                 }
3187                 memcpy(expectLimits, limits, limitsCount*4);
3188             } else if(limitsCount!=expectCount) {
3189                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3190                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3191             } else {
3192                 for(j=0; j<limitsCount; ++j) {
3193                     if(limits[j]!=expectLimits[j]) {
3194                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3195                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3196                               j, (long)limits[j], (long)expectLimits[j]);
3197                         break;
3198                     }
3199                 }
3200             }
3201         }
3202     }
3203 
3204     // Compare span() with containsAll()/containsNone(),
3205     // but only if we have expectLimits[] from the uncomplemented set.
3206     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3207         const UChar *s16=(const UChar *)s;
3208         UnicodeString string;
3209         int32_t prev=0, limit, length;
3210         for(i=0; i<expectCount; ++i) {
3211             limit=expectLimits[i];
3212             length=limit-prev;
3213             if(length>0) {
3214                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3215                 if(i&1) {
3216                     if(!sets[SLOW]->getSet().containsAll(string)) {
3217                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3218                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3219                         return;
3220                     }
3221                     if(!sets[FAST]->getSet().containsAll(string)) {
3222                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3223                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3224                         return;
3225                     }
3226                 } else {
3227                     if(!sets[SLOW]->getSet().containsNone(string)) {
3228                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3229                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3230                         return;
3231                     }
3232                     if(!sets[FAST]->getSet().containsNone(string)) {
3233                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3234                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3235                         return;
3236                     }
3237                 }
3238             }
3239             prev=limit;
3240         }
3241     }
3242 }
3243 
3244 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3245 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3246                               const void *s, int32_t length, UBool isUTF16,
3247                               uint32_t whichSpans,
3248                               const char *testName, int32_t index) {
3249     int32_t expectLimits[500];
3250     int32_t expectCount=-1;
3251     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3252 }
3253 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3254 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3255     UChar c, c2;
3256 
3257     if(length>=0) {
3258         while(length>0) {
3259             c=*s++;
3260             --length;
3261             if(0xd800<=c && c<0xe000) {
3262                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3263                     return TRUE;
3264                 }
3265                 --length;
3266             }
3267         }
3268     } else {
3269         while((c=*s++)!=0) {
3270             if(0xd800<=c && c<0xe000) {
3271                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3272                     return TRUE;
3273                 }
3274             }
3275         }
3276     }
3277     return FALSE;
3278 }
3279 
3280 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3281 // unless either UTF is turned off in whichSpans.
3282 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3283 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3284 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3285                                       const UChar *s16, int32_t length16,
3286                                       uint32_t whichSpans,
3287                                       const char *testName, int32_t index) {
3288     int32_t expectLimits[500];
3289     int32_t expectCount;
3290 
3291     expectCount=-1;  // Get expectLimits[] from testSpan().
3292 
3293     if((whichSpans&SPAN_UTF16)!=0) {
3294         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3295     }
3296     if((whichSpans&SPAN_UTF8)==0) {
3297         return;
3298     }
3299 
3300     // Convert s16[] and expectLimits[] to UTF-8.
3301     uint8_t s8[3000];
3302     int32_t offsets[3000];
3303 
3304     const UChar *s16Limit=s16+length16;
3305     char *t=(char *)s8;
3306     char *tLimit=t+sizeof(s8);
3307     int32_t *o=offsets;
3308     UErrorCode errorCode=U_ZERO_ERROR;
3309 
3310     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3311     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3312     if(U_FAILURE(errorCode)) {
3313         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3314               testName, (long)index, u_errorName(errorCode));
3315         ucnv_resetFromUnicode(utf8Cnv);
3316         return;
3317     }
3318     int32_t length8=(int32_t)(t-(char *)s8);
3319 
3320     // Convert expectLimits[].
3321     int32_t i, j, expect;
3322     for(i=j=0; i<expectCount; ++i) {
3323         expect=expectLimits[i];
3324         if(expect==length16) {
3325             expectLimits[i]=length8;
3326         } else {
3327             while(offsets[j]<expect) {
3328                 ++j;
3329             }
3330             expectLimits[i]=j;
3331         }
3332     }
3333 
3334     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3335 }
3336 
nextCodePoint(UChar32 c)3337 static UChar32 nextCodePoint(UChar32 c) {
3338     // Skip some large and boring ranges.
3339     switch(c) {
3340     case 0x3441:
3341         return 0x4d7f;
3342     case 0x5100:
3343         return 0x9f00;
3344     case 0xb040:
3345         return 0xd780;
3346     case 0xe041:
3347         return 0xf8fe;
3348     case 0x10100:
3349         return 0x20000;
3350     case 0x20041:
3351         return 0xe0000;
3352     case 0xe0101:
3353         return 0x10fffd;
3354     default:
3355         return c+1;
3356     }
3357 }
3358 
3359 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3360 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3361     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3362     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3363     // Skip the UTF-8 part of the test - if the string contains surrogates -
3364     // because it is likely to produce a different result.
3365     UBool inconsistentSurrogates=
3366             (!(sets[0]->getSet().contains(0xfffd) ?
3367                sets[0]->getSet().contains(0xd800, 0xdfff) :
3368                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3369              sets[0]->hasStringsWithSurrogates());
3370 
3371     UChar s[1000];
3372     int32_t length=0;
3373     uint32_t localWhichSpans;
3374 
3375     UChar32 c, first;
3376     for(first=c=0;; c=nextCodePoint(c)) {
3377         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3378             localWhichSpans=whichSpans;
3379             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3380                 localWhichSpans&=~SPAN_UTF8;
3381             }
3382             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3383             if(c>0x10ffff) {
3384                 break;
3385             }
3386             length=0;
3387             first=c;
3388         }
3389         U16_APPEND_UNSAFE(s, length, c);
3390     }
3391 }
3392 
3393 // Test with a particular, interesting string.
3394 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3395 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3396     static const UChar s[]={
3397         0x61, 0x62, 0x20,                       // Latin, space
3398         0x3b1, 0x3b2, 0x3b3,                    // Greek
3399         0xd900,                                 // lead surrogate
3400         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3401         0xdc05,                                 // trail surrogate
3402         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3403         0xd900, 0xdc05,                         // unassigned supplementary
3404         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3405         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3406         0                                       // NUL
3407     };
3408 
3409     if((whichSpans&SPAN_UTF16)==0) {
3410         return;
3411     }
3412     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3413     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3414 }
3415 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3416 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3417     static const char s[]={
3418         "abc"                                   // Latin
3419 
3420         /* trail byte in lead position */
3421         "\x80"
3422 
3423         " "                                     // space
3424 
3425         /* truncated multi-byte sequences */
3426         "\xd0"
3427         "\xe0"
3428         "\xe1"
3429         "\xed"
3430         "\xee"
3431         "\xf0"
3432         "\xf1"
3433         "\xf4"
3434         "\xf8"
3435         "\xfc"
3436 
3437         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3438 
3439         /* trail byte in lead position */
3440         "\x80"
3441 
3442         "\xe0\x80"
3443         "\xe0\xa0"
3444         "\xe1\x80"
3445         "\xed\x80"
3446         "\xed\xa0"
3447         "\xee\x80"
3448         "\xf0\x80"
3449         "\xf0\x90"
3450         "\xf1\x80"
3451         "\xf4\x80"
3452         "\xf4\x90"
3453         "\xf8\x80"
3454         "\xfc\x80"
3455 
3456         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3457 
3458         /* trail byte in lead position */
3459         "\x80"
3460 
3461         "\xf0\x80\x80"
3462         "\xf0\x90\x80"
3463         "\xf1\x80\x80"
3464         "\xf4\x80\x80"
3465         "\xf4\x90\x80"
3466         "\xf8\x80\x80"
3467         "\xfc\x80\x80"
3468 
3469         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3470 
3471         /* trail byte in lead position */
3472         "\x80"
3473 
3474         "\xf8\x80\x80\x80"
3475         "\xfc\x80\x80\x80"
3476 
3477         "\xF1\x90\x80\x85"                      // unassigned supplementary
3478 
3479         /* trail byte in lead position */
3480         "\x80"
3481 
3482         "\xfc\x80\x80\x80\x80"
3483 
3484         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3485 
3486         /* trail byte in lead position */
3487         "\x80"
3488 
3489         /* complete sequences but non-shortest forms or out of range etc. */
3490         "\xc0\x80"
3491         "\xe0\x80\x80"
3492         "\xed\xa0\x80"
3493         "\xf0\x80\x80\x80"
3494         "\xf4\x90\x80\x80"
3495         "\xf8\x80\x80\x80\x80"
3496         "\xfc\x80\x80\x80\x80\x80"
3497         "\xfe"
3498         "\xff"
3499 
3500         /* trail byte in lead position */
3501         "\x80"
3502 
3503         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3504     };
3505 
3506     if((whichSpans&SPAN_UTF8)==0) {
3507         return;
3508     }
3509     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3510     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3511 }
3512 
// Multiplies a list of span option words so that each resulting word carries
// exactly one of the alternatives a, b and c.
// Every existing entry is first reduced with mask; the entry itself then gets
// a, a copy at [count+i] gets b (if b!=0), and a copy at [2*count+i] gets c
// (if b!=0 and c!=0).
// If b==0, the entries are just modified in place with mask and a.
// If b!=0 and c==0, they are modified with mask, a and b.
// The array must have room for the returned number of entries.
// Returns the new entry count: 1x, 2x or 3x whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    const bool useB=(b!=0);
    const bool useC=useB && (c!=0);

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(useB) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(useC) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }

    int32_t portions=1;
    if(useC) {
        portions=3;
    } else if(useB) {
        portions=2;
    }
    return portions*whichSpansCount;
}
3535 
3536 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3537 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3538 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3539 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3540 
TestSpan()3541 void UnicodeSetTest::TestSpan() {
3542     // "[...]" is a UnicodeSet pattern.
3543     // "*" performs tests on all Unicode code points and on a selection of
3544     //   malformed UTF-8/16 strings.
3545     // "-options" limits the scope of testing for the current set.
3546     //   By default, the test verifies that equivalent boundaries are found
3547     //   for UTF-16 and UTF-8, going forward and backward,
3548     //   alternating USET_SPAN_NOT_CONTAINED with
3549     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3550     //   Single-character options:
3551     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3552     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3553     //          or the set contains strings with unpaired surrogates
3554     //          which do not translate to valid UTF-8.
3555     //     c -- set.span() and set.complement().span() boundaries may differ.
3556     //          Cause: Set strings are not complemented.
3557     //     b -- span() and spanBack() boundaries may differ.
3558     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3559     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3560     //          match with non-overlapping substrings.
3561     //          For example, with a set containing "ab" and "ba",
3562     //          span() of "aba" yields boundaries { 0, 2, 3 }
3563     //          because the initial "ab" matches from 0 to 2,
3564     //          while spanBack() yields boundaries { 0, 1, 3 }
3565     //          because the final "ba" matches from 1 to 3.
3566     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3567     //          Cause: Strings in the set overlap, and a longer match may
3568     //          require a sequence including non-longest substrings.
3569     //          For example, with a set containing "ab", "abc" and "cd",
3570     //          span(contained) of "abcd" spans the entire string
3571     //          but span(longest match) only spans the first 3 characters.
3572     //   Each "-options" first resets all options and then applies the specified options.
3573     //   A "-" without options resets the options.
3574     //   The options are also reset for each new set.
3575     // Other strings will be spanned.
3576     static const char *const testdata[]={
3577         "[:ID_Continue:]",
3578         "*",
3579         "[:White_Space:]",
3580         "*",
3581         "[]",
3582         "*",
3583         "[\\u0000-\\U0010FFFF]",
3584         "*",
3585         "[\\u0000\\u0080\\u0800\\U00010000]",
3586         "*",
3587         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3588         "*",
3589         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3590         "-c",
3591         "*",
3592         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3593         "-c",
3594         "*",
3595 
3596         // Overlapping strings cause overlapping attempts to match.
3597         "[x{xy}{xya}{axy}{ax}]",
3598         "-cl",
3599 
3600         // More repetitions of "xya" would take too long with the recursive
3601         // reference implementation.
3602         // containsAll()=FALSE
3603         // test_string 0x14
3604         "xx"
3605         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3606         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3607         "xyaxyaxyaxya"
3608         "xx"
3609         "xyaxyaxyaxya"  // span() ends here.
3610         "aaa",
3611 
3612         // containsAll()=TRUE
3613         // test_string 0x15
3614         "xx"
3615         "xyaxyaxyaxya"
3616         "xx"
3617         "xyaxyaxyaxya"
3618         "xx"
3619         "xyaxyaxyaxy",
3620 
3621         "-bc",
3622         // test_string 0x17
3623         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3624         "-c",
3625         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3626         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3627         "-",
3628         "byaya",     // span() -> { 5 }
3629         "byay",      // span() -> { 4 }
3630         "bya",       // span() -> { 3 }
3631 
3632         // span(longest match) will not span the whole string.
3633         "[a{ab}{bc}]",
3634         "-cl",
3635         // test_string 0x21
3636         "abc",
3637 
3638         "[a{ab}{abc}{cd}]",
3639         "-cl",
3640         "acdabcdabccd",
3641 
3642         // spanBack(longest match) will not span the whole string.
3643         "[c{ab}{bc}]",
3644         "-cl",
3645         "abc",
3646 
3647         "[d{cd}{bcd}{ab}]",
3648         "-cl",
3649         "abbcdabcdabd",
3650 
3651         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3652         // and UTF-8 trail bytes.
3653         // Copies of above test sets and strings, but transliterated to have
3654         // different code points with similar trail units.
3655         // Previous: a      b         c            d
3656         // Unicode:  042B   30AB      200AB        204AB
3657         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3658         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3659         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3660         "-cl",
3661         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3662 
3663         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3664         "-cl",
3665         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3666 
3667         // Stress bookkeeping and recursion.
3668         // The following strings are barely doable with the recursive
3669         // reference implementation.
3670         // The not-contained character at the end prevents an early exit from the span().
3671         "[b{bb}]",
3672         "-c",
3673         // test_string 0x33
3674         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3675         // On complement sets, span() and spanBack() get different results
3676         // because b is not in the complement set and there is an odd number of b's
3677         // in the test string.
3678         "-bc",
3679         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3680 
3681         // Test with set strings with an initial or final code point span
3682         // longer than 254.
3683         "[a{" _64_a _64_a _64_a _64_a "b}"
3684           "{a" _64_b _64_b _64_b _64_b "}]",
3685         "-c",
3686         _64_a _64_a _64_a _63_a "b",
3687         _64_a _64_a _64_a _64_a "b",
3688         _64_a _64_a _64_a _64_a "aaaabbbb",
3689         "a" _64_b _64_b _64_b _63_b,
3690         "a" _64_b _64_b _64_b _64_b,
3691         "aaaabbbb" _64_b _64_b _64_b _64_b,
3692 
3693         // Test with strings containing unpaired surrogates.
3694         // They are not representable in UTF-8, and a leading trail surrogate
3695         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3696         // U+20001 == \\uD840\\uDC01
3697         // U+20400 == \\uD841\\uDC00
3698         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3699         "-8cl",
3700         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3701     };
3702     uint32_t whichSpans[96]={ SPAN_ALL };
3703     int32_t whichSpansCount=1;
3704 
3705     UnicodeSet *sets[SET_COUNT]={ NULL };
3706     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3707 
3708     char testName[1024];
3709     char *testNameLimit=testName;
3710 
3711     int32_t i, j;
3712     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3713         const char *s=testdata[i];
3714         if(s[0]=='[') {
3715             // Create new test sets from this pattern.
3716             for(j=0; j<SET_COUNT; ++j) {
3717                 delete sets_with_str[j];
3718                 delete sets[j];
3719             }
3720             UErrorCode errorCode=U_ZERO_ERROR;
3721             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3722             if(U_FAILURE(errorCode)) {
3723                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3724                 break;
3725             }
3726             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3727             sets[SLOW_NOT]->complement();
3728             // Intermediate set: Test cloning of a frozen set.
3729             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3730             fast->freeze();
3731             sets[FAST]=fast->clone();
3732             delete fast;
3733             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3734             fastNot->freeze();
3735             sets[FAST_NOT]=fastNot->clone();
3736             delete fastNot;
3737 
3738             for(j=0; j<SET_COUNT; ++j) {
3739                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3740             }
3741 
3742             strcpy(testName, s);
3743             testNameLimit=strchr(testName, 0);
3744             *testNameLimit++=':';
3745             *testNameLimit=0;
3746 
3747             whichSpans[0]=SPAN_ALL;
3748             whichSpansCount=1;
3749         } else if(s[0]=='-') {
3750             whichSpans[0]=SPAN_ALL;
3751             whichSpansCount=1;
3752 
3753             while(*++s!=0) {
3754                 switch(*s) {
3755                 case 'c':
3756                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3757                                                    ~SPAN_POLARITY,
3758                                                    SPAN_SET,
3759                                                    SPAN_COMPLEMENT,
3760                                                    0);
3761                     break;
3762                 case 'b':
3763                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3764                                                    ~SPAN_DIRS,
3765                                                    SPAN_FWD,
3766                                                    SPAN_BACK,
3767                                                    0);
3768                     break;
3769                 case 'l':
3770                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3771                     // USET_SPAN_SIMPLE only FWD, and separately
3772                     // USET_SPAN_SIMPLE only BACK
3773                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3774                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3775                                                    SPAN_DIRS|SPAN_CONTAINED,
3776                                                    SPAN_FWD|SPAN_SIMPLE,
3777                                                    SPAN_BACK|SPAN_SIMPLE);
3778                     break;
3779                 case '8':
3780                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3781                                                    ~SPAN_UTFS,
3782                                                    SPAN_UTF16,
3783                                                    SPAN_UTF8,
3784                                                    0);
3785                     break;
3786                 default:
3787                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3788                     break;
3789                 }
3790             }
3791         } else if(0==strcmp(s, "*")) {
3792             strcpy(testNameLimit, "bad_string");
3793             for(j=0; j<whichSpansCount; ++j) {
3794                 if(whichSpansCount>1) {
3795                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3796                             "%%0x%3x",
3797                             whichSpans[j]);
3798                 }
3799                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3800                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3801             }
3802 
3803             strcpy(testNameLimit, "contents");
3804             for(j=0; j<whichSpansCount; ++j) {
3805                 if(whichSpansCount>1) {
3806                     sprintf(testNameLimit+8 /* strlen("contents") */,
3807                             "%%0x%3x",
3808                             whichSpans[j]);
3809                 }
3810                 testSpanContents(sets_with_str, whichSpans[j], testName);
3811             }
3812         } else {
3813             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3814             strcpy(testNameLimit, "test_string");
3815             for(j=0; j<whichSpansCount; ++j) {
3816                 if(whichSpansCount>1) {
3817                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3818                             "%%0x%3x",
3819                             whichSpans[j]);
3820                 }
3821                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3822             }
3823         }
3824     }
3825     for(j=0; j<SET_COUNT; ++j) {
3826         delete sets_with_str[j];
3827         delete sets[j];
3828     }
3829 }
3830 
3831 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3832 void UnicodeSetTest::TestStringSpan() {
3833     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3834     static const char *const string=
3835         "xx"
3836         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3837         "xx"
3838         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3839         "xx"
3840         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3841         "aaaa";
3842 
3843     UErrorCode errorCode=U_ZERO_ERROR;
3844     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3845     UnicodeSet set(pattern16, errorCode);
3846     if(U_FAILURE(errorCode)) {
3847         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3848         return;
3849     }
3850 
3851     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3852 
3853     if(set.containsAll(string16)) {
3854         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3855     }
3856 
3857     // Remove trailing "aaaa".
3858     string16.truncate(string16.length()-4);
3859     if(!set.containsAll(string16)) {
3860         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3861     }
3862 
3863     string16=u"byayaxya";
3864     const UChar *s16=string16.getBuffer();
3865     int32_t length16=string16.length();
3866     (void)length16;   // Suppress set but not used warning.
3867     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3868         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3869         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3870         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3871         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3872         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3873     ) {
3874         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3875     }
3876 
3877     pattern="[a{ab}{abc}{cd}]";
3878     pattern16=UnicodeString(pattern, -1, US_INV);
3879     set.applyPattern(pattern16, errorCode);
3880     if(U_FAILURE(errorCode)) {
3881         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3882         return;
3883     }
3884     string16=u"acdabcdabccd";
3885     s16=string16.getBuffer();
3886     length16=string16.length();
3887     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3888         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3889         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3890     ) {
3891         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3892     }
3893 
3894     pattern="[d{cd}{bcd}{ab}]";
3895     pattern16=UnicodeString(pattern, -1, US_INV);
3896     set.applyPattern(pattern16, errorCode).freeze();
3897     if(U_FAILURE(errorCode)) {
3898         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3899         return;
3900     }
3901     string16=u"abbcdabcdabd";
3902     s16=string16.getBuffer();
3903     length16=string16.length();
3904     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3905         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3906         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3907     ) {
3908         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3909     }
3910 }
3911 
TestPatternWithSurrogates()3912 void UnicodeSetTest::TestPatternWithSurrogates() {
3913     IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
3914     // Regression test for ICU-11891
3915     UnicodeSet surrogates;
3916     surrogates.add(0xd000, 0xd82f);  // a range ending with a lead surrogate code point
3917     surrogates.add(0xd83a);  // a lead surrogate
3918     surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3919     UnicodeString pat;
3920     surrogates.toPattern(pat, false);  // bad if U+D83A is immediately followed by U+DC00
3921     UnicodeSet s2;
3922     // was: U_MALFORMED_SET
3923     // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
3924     s2.applyPattern(pat, errorCode);
3925     if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
3926     checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
3927 
3928     // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
3929     surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
3930     // add a beyond-surrogates range, up to the last code point
3931     surrogates.add(0x10affe, 0x10ffff);
3932     surrogates.toPattern(pat, false);  // bad if U+DBFF is immediately followed by U+DC00
3933     s2.applyPattern(pat, errorCode);
3934     if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
3935     checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
3936 
3937     // Test the toPattern() code path when the pattern is shorter in complement form:
3938     // [^opposite-ranges]
3939     surrogates.add(0, 0x6789);
3940     surrogates.toPattern(pat, false);
3941     s2.applyPattern(pat, errorCode);
3942     if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
3943     checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
3944 
3945     // Start with a pattern, in case the original pattern is kept but
3946     // without the extra white space.
3947     surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
3948     if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
3949     surrogates.toPattern(pat, false);
3950     s2.applyPattern(pat, errorCode);
3951     if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
3952     checkEqual(surrogates, s2, "surrogates from/to/from pattern");
3953 }
3954 
TestIntOverflow()3955 void UnicodeSetTest::TestIntOverflow() {
3956     // This test triggers undefined double->int conversion behavior
3957     // if the implementation is not careful.
3958     IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3959     UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3960     assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3961     assertEquals("[:ccc=int_overflow:] -> illegal argument",
3962                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3963 }
3964 
TestUnusedCcc()3965 void UnicodeSetTest::TestUnusedCcc() {
3966 #if !UCONFIG_NO_NORMALIZATION
3967     // All numeric ccc values 0..255 are valid, but many are unused.
3968     IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3969     UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3970     assertSuccess("[:ccc=2:]", errorCode);
3971     assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3972 
3973     UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3974     assertSuccess("[:ccc=255:]", errorCode);
3975     assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3976 
3977     // Non-integer values and values outside 0..255 are invalid.
3978     UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3979     assertEquals("[:ccc=-1:] -> illegal argument",
3980                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3981     assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3982 
3983     UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3984     assertEquals("[:ccc=256:] -> illegal argument",
3985                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3986     assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3987 
3988     UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3989     assertEquals("[:ccc=1.1:] -> illegal argument",
3990                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3991     assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
3992 #endif
3993 }
3994 
TestDeepPattern()3995 void UnicodeSetTest::TestDeepPattern() {
3996     IcuTestErrorCode errorCode(*this, "TestDeepPattern");
3997     // Nested ranges are parsed via recursion which can use a lot of stack space.
3998     // After a reasonable limit, we should get an error.
3999     constexpr int32_t DEPTH = 20000;
4000     UnicodeString pattern, suffix;
4001     for (int32_t i = 0; i < DEPTH; ++i) {
4002         pattern.append(u"[a", 2);
4003         suffix.append(']');
4004     }
4005     pattern.append(suffix);
4006     UnicodeSet set(pattern, errorCode);
4007     assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
4008     errorCode.reset();
4009 }
4010 
TestEmptyString()4011 void UnicodeSetTest::TestEmptyString() {
4012     IcuTestErrorCode errorCode(*this, "TestEmptyString");
4013     // Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
4014     UnicodeSet set(u"[{}]", errorCode);
4015     if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
4016     assertTrue("set from pattern with {}", set.contains(u""));
4017     assertEquals("set from pattern with {}: size", 1, set.size());
4018     assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
4019 
4020     // Remove, add back, ...
4021     assertFalse("remove empty string", set.remove(u"").contains(u""));
4022     assertEquals("remove empty string: size", 0, set.size());
4023     assertTrue("remove empty string: isEmpty", set.isEmpty());
4024     assertTrue("add empty string", set.add(u"").contains(u""));
4025     // missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
4026     assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
4027     assertTrue("complement-add empty string", set.complement(u"").contains(u""));
4028 
4029     assertFalse("clear", set.clear().contains(u""));
4030     assertTrue("add empty string 2", set.add(u"").contains(u""));
4031     assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
4032     assertTrue("add empty string 3", set.add(u"").contains(u""));
4033     // Note that this leaves the set containing exactly the empty string.
4034 
4035     // strings() access and iteration
4036     // no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
4037     UnicodeSetIterator sit(set);
4038     assertTrue("set iterator.next()", sit.next());
4039     assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
4040 
4041     // The empty string is ignored in matching.
4042     set.add(u'a').add(u'c');
4043     assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4044     assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4045     assertTrue("containsNone", set.containsNone(u"def"));
4046     assertFalse("containsSome", set.containsSome(u"def"));
4047     set.freeze();
4048     assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4049     assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4050     assertTrue("frozen containsNone", set.containsNone(u"def"));
4051     assertFalse("frozen containsSome", set.containsSome(u"def"));
4052 }
4053 
assertNext(UnicodeSetIterator & iter,const UnicodeString & expected)4054 void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
4055     assertTrue(expected + ".next()", iter.next());
4056     assertEquals(expected + ".getString()", expected, iter.getString());
4057 }
4058 
TestSkipToStrings()4059 void UnicodeSetTest::TestSkipToStrings() {
4060     IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
4061     UnicodeSet set(u"[0189{}{ch}]", errorCode);
4062     UnicodeSetIterator iter(set);
4063     assertNext(iter.skipToStrings(), u"");
4064     assertNext(iter, u"ch");
4065     assertFalse("no next", iter.next());
4066 
4067     iter.reset();
4068     assertNext(iter, u"0");
4069     assertNext(iter, u"1");
4070     assertNext(iter, u"8");
4071     assertNext(iter, u"9");
4072     assertNext(iter, u"");
4073     assertNext(iter, u"ch");
4074     assertFalse("no next", iter.next());
4075 
4076     iter.reset();
4077     assertNext(iter, u"0");
4078     iter.skipToStrings();
4079     assertNext(iter, u"");
4080     assertNext(iter, u"ch");
4081     assertFalse("no next", iter.next());
4082 
4083     iter.reset();
4084     iter.nextRange();
4085     assertNext(iter, u"8");
4086     iter.skipToStrings();
4087     assertNext(iter, u"");
4088     assertNext(iter, u"ch");
4089     assertFalse("no next", iter.next());
4090 
4091     iter.reset();
4092     iter.nextRange();
4093     iter.nextRange();
4094     iter.nextRange();
4095     iter.skipToStrings();
4096     assertNext(iter, u"ch");
4097     assertFalse("no next", iter.next());
4098 }
4099 
TestPatternCodePointComplement()4100 void UnicodeSetTest::TestPatternCodePointComplement() {
4101     IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
4102     // ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
4103     // [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
4104     {
4105         UnicodeSet simple(u"[^abc{ch}]", errorCode);
4106         assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
4107         assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
4108         assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
4109     }
4110 
4111     {
4112         UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
4113         if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
4114             return;
4115         }
4116         assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
4117         assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
4118         assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U'��'));
4119     }
4120 
4121     {
4122         UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
4123         assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
4124         assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
4125         assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U'��'));
4126     }
4127 
4128     {
4129         UnicodeSet notBasic;
4130         notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
4131         assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
4132                 notBasic.size() > 1000);
4133         assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
4134                 notBasic.hasStrings());
4135         assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
4136                 notBasic.contains(U'��'));
4137     }
4138 
4139     {
4140         UnicodeSet notBasic;
4141         notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
4142         assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
4143                 notBasic.size() > 1000);
4144         assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
4145                 notBasic.hasStrings());
4146         assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
4147                 notBasic.contains(U'��'));
4148     }
4149 
4150     // When there are strings, we must not use the complement for a more compact toPattern().
4151     {
4152         UnicodeSet set;
4153         set.add(0,  u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
4154         UnicodeString pattern;
4155         set.toPattern(pattern, true);
4156         UnicodeSet set2(pattern, errorCode);
4157         checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
4158         assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
4159 
4160         set.add("ch").add("ss");
4161         set.toPattern(pattern, true);
4162         set2 = UnicodeSet(pattern, errorCode);
4163         checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
4164         assertEquals("set(with 0 & max, with strings).toPattern()",
4165                 u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
4166     }
4167 
4168     // The complement() API behavior does not change under this ticket.
4169     {
4170         UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
4171         notBasic.complement();
4172         assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
4173         assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
4174         assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
4175                 notBasic.contains(u"��\uFE0F"));
4176         assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'��'));
4177     }
4178 }
4179