1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
12 */
13
14 #include <stdio.h>
15
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
29 #include "cmemory.h"
30 #include "hash.h"
31
// Reports, through dataerrln(), the current file, line and ICU error name
// whenever `status` holds a failure code. Does nothing on success.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
38
// Reports, through dataerrln(), the current file and line together with the
// text of the failed expression whenever `expr` evaluates to false.
// The expression text is passed as a %s argument (never as part of the format
// string), so '%' characters inside it are harmless.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END
44
// Appends the escaped pattern form of `set` to `left`; convenience for
// building log and error messages.
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
50
UnicodeSetTest()51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55 if(utf8Cnv==NULL) {
56 UErrorCode errorCode=U_ZERO_ERROR;
57 utf8Cnv=ucnv_open("UTF-8", &errorCode);
58 }
59 return utf8Cnv;
60 }
61
~UnicodeSetTest()62 UnicodeSetTest::~UnicodeSetTest() {
63 ucnv_close(utf8Cnv);
64 }
65
66 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68 const char* &name, char* /*par*/) {
69 if (exec) {
70 logln(u"TestSuite UnicodeSetTest");
71 }
72 TESTCASE_AUTO_BEGIN;
73 TESTCASE_AUTO(TestPatterns);
74 TESTCASE_AUTO(TestAddRemove);
75 TESTCASE_AUTO(TestCategories);
76 TESTCASE_AUTO(TestCloneEqualHash);
77 TESTCASE_AUTO(TestMinimalRep);
78 TESTCASE_AUTO(TestAPI);
79 TESTCASE_AUTO(TestScriptSet);
80 TESTCASE_AUTO(TestPropertySet);
81 TESTCASE_AUTO(TestClone);
82 TESTCASE_AUTO(TestExhaustive);
83 TESTCASE_AUTO(TestToPattern);
84 TESTCASE_AUTO(TestIndexOf);
85 TESTCASE_AUTO(TestStrings);
86 TESTCASE_AUTO(Testj2268);
87 TESTCASE_AUTO(TestCloseOver);
88 TESTCASE_AUTO(TestEscapePattern);
89 TESTCASE_AUTO(TestInvalidCodePoint);
90 TESTCASE_AUTO(TestSymbolTable);
91 TESTCASE_AUTO(TestSurrogate);
92 TESTCASE_AUTO(TestPosixClasses);
93 TESTCASE_AUTO(TestIteration);
94 TESTCASE_AUTO(TestFreezable);
95 TESTCASE_AUTO(TestSpan);
96 TESTCASE_AUTO(TestStringSpan);
97 TESTCASE_AUTO(TestPatternWithSurrogates);
98 TESTCASE_AUTO(TestIntOverflow);
99 TESTCASE_AUTO(TestUnusedCcc);
100 TESTCASE_AUTO(TestDeepPattern);
101 TESTCASE_AUTO(TestEmptyString);
102 TESTCASE_AUTO(TestSkipToStrings);
103 TESTCASE_AUTO(TestPatternCodePointComplement);
104 TESTCASE_AUTO_END;
105 }
106
// Sentinel placed inside the expected-strings arrays handed to
// expectToPattern(). NOTE(review): it appears to separate strings the set must
// contain from strings it must not contain -- confirm against expectToPattern().
static const char NOT[] = { '%', '%', '%', '%', '\0' };
108
109 /**
110 * UVector was improperly copying contents
111 * This code will crash this is still true
112 */
Testj2268()113 void UnicodeSetTest::Testj2268() {
114 UnicodeSet t;
115 t.add(UnicodeString("abc"));
116 UnicodeSet test(t);
117 UnicodeString ustrPat;
118 test.toPattern(ustrPat, TRUE);
119 }
120
121 /**
122 * Test toPattern().
123 */
TestToPattern()124 void UnicodeSetTest::TestToPattern() {
125 UErrorCode ec = U_ZERO_ERROR;
126
127 // Test that toPattern() round trips with syntax characters and
128 // whitespace.
129 {
130 static const char* OTHER_TOPATTERN_TESTS[] = {
131 "[[:latin:]&[:greek:]]",
132 "[[:latin:]-[:greek:]]",
133 "[:nonspacing mark:]",
134 NULL
135 };
136
137 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
138 ec = U_ZERO_ERROR;
139 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
140 if (U_FAILURE(ec)) {
141 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
142 continue;
143 }
144 checkPat(OTHER_TOPATTERN_TESTS[j], s);
145 }
146
147 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
148 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
149
150 // check various combinations to make sure they all work.
151 if (i != 0 && !toPatternAux(i, i)){
152 continue;
153 }
154 if (!toPatternAux(0, i)){
155 continue;
156 }
157 if (!toPatternAux(i, 0xFFFF)){
158 continue;
159 }
160 }
161 }
162 }
163
164 // Test pattern behavior of multicharacter strings.
165 {
166 ec = U_ZERO_ERROR;
167 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
168
169 // This loop isn't a loop. It's here to make the compiler happy.
170 // If you're curious, try removing it and changing the 'break'
171 // statements (except for the last) to goto's.
172 for (;;) {
173 if (U_FAILURE(ec)) break;
174 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
175 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
176
177 s->add("ac");
178 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
179 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
180
181 s->applyPattern(u"[a-z {\\{l} {r\\}}]", ec);
182 if (U_FAILURE(ec)) break;
183 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
184 expectToPattern(*s, u"[a-z{r\\}}{\\{l}]", exp3);
185
186 s->add("[]");
187 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
188 expectToPattern(*s, u"[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
189
190 s->applyPattern(u"[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
191 if (U_FAILURE(ec)) break;
192 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
193 expectToPattern(*s, u"[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
194
195 // j2189
196 s->clear();
197 s->add(UnicodeString("abc", ""));
198 s->add(UnicodeString("abc", ""));
199 const char* exp6[] = {"abc", NOT, "ab", NULL};
200 expectToPattern(*s, "[{abc}]", exp6);
201
202 break;
203 }
204
205 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
206 delete s;
207 }
208
209 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
210 UnicodeSet s;
211 s.add(u'a', u'b');
212 expectToPattern(s, "[ab]", NULL);
213 }
214
toPatternAux(UChar32 start,UChar32 end)215 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
216
217 // use Integer.toString because Utility.hex doesn't handle ints
218 UnicodeString pat = "";
219 // TODO do these in hex
220 //String source = "0x" + Integer.toString(start,16).toUpperCase();
221 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
222 UnicodeString source;
223 source = source + (uint32_t)start;
224 if (start != end)
225 source = source + ".." + (uint32_t)end;
226 UnicodeSet testSet;
227 testSet.add(start, end);
228 return checkPat(source, testSet);
229 }
230
checkPat(const UnicodeString & source,const UnicodeSet & testSet)231 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
232 const UnicodeSet& testSet) {
233 // What we want to make sure of is that a pattern generated
234 // by toPattern(), with or without escaped unprintables, can
235 // be passed back into the UnicodeSet constructor.
236 UnicodeString pat0;
237
238 testSet.toPattern(pat0, TRUE);
239
240 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
241
242 //String pat1 = unescapeLeniently(pat0);
243 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
244
245 UnicodeString pat2;
246 testSet.toPattern(pat2, FALSE);
247 if (!checkPat(source, testSet, pat2)) return FALSE;
248
249 //String pat3 = unescapeLeniently(pat2);
250 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
251
252 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
253 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
254 return TRUE;
255 }
256
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)257 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
258 const UnicodeSet& testSet,
259 const UnicodeString& pat) {
260 UErrorCode ec = U_ZERO_ERROR;
261 UnicodeSet testSet2(pat, ec);
262 if (testSet2 != testSet) {
263 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
264 return FALSE;
265 }
266 return TRUE;
267 }
268
269 void
TestPatterns(void)270 UnicodeSetTest::TestPatterns(void) {
271 UnicodeSet set;
272 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
273 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
274 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
275 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
276 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
277 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
278
279 // Throw in a test of complement
280 set.complement();
281 UnicodeString exp;
282 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(u'z'+1)).append(u'\uFFFF');
283 expectPairs(set, exp);
284 }
285
286 void
TestCategories(void)287 UnicodeSetTest::TestCategories(void) {
288 UErrorCode status = U_ZERO_ERROR;
289 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
290 UnicodeSet set(pat, status);
291 if (U_FAILURE(status)) {
292 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
293 return;
294 } else {
295 expectContainment(set, pat, "ABC", "abc");
296 }
297
298 UChar32 i;
299 int32_t failures = 0;
300 // Make sure generation of L doesn't pollute cached Lu set
301 // First generate L, then Lu
302 set.applyPattern("[:L:]", status);
303 if (U_FAILURE(status)) { errln("FAIL"); return; }
304 for (i=0; i<0x200; ++i) {
305 UBool l = u_isalpha((UChar)i);
306 if (l != set.contains(i)) {
307 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
308 set.contains(i));
309 if (++failures == 10) break;
310 }
311 }
312
313 set.applyPattern("[:Lu:]", status);
314 if (U_FAILURE(status)) { errln("FAIL"); return; }
315 for (i=0; i<0x200; ++i) {
316 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
317 if (lu != set.contains(i)) {
318 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
319 set.contains(i));
320 if (++failures == 20) break;
321 }
322 }
323 }
324 void
TestCloneEqualHash(void)325 UnicodeSetTest::TestCloneEqualHash(void) {
326 UErrorCode status = U_ZERO_ERROR;
327 // set1 and set2 used to be built with the obsolete constructor taking
328 // UCharCategory values; replaced with pattern constructors
329 // markus 20030502
330 UnicodeSet *set1=new UnicodeSet(u"\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase
331 UnicodeSet *set1a=new UnicodeSet(u"[:Ll:]", status); // Letter, lowercase
332 if (U_FAILURE(status)){
333 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
334 return;
335 }
336 UnicodeSet *set2=new UnicodeSet(u"\\p{Decimal Number}", status); //Number, Decimal digit
337 UnicodeSet *set2a=new UnicodeSet(u"[:Nd:]", status); //Number, Decimal digit
338 if (U_FAILURE(status)){
339 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
340 return;
341 }
342
343 if (*set1 != *set1a) {
344 errln("FAIL: category constructor for Ll broken");
345 }
346 if (*set2 != *set2a) {
347 errln("FAIL: category constructor for Nd broken");
348 }
349 delete set1a;
350 delete set2a;
351
352 logln("Testing copy construction");
353 UnicodeSet *set1copy=new UnicodeSet(*set1);
354 if(*set1 != *set1copy || *set1 == *set2 ||
355 getPairs(*set1) != getPairs(*set1copy) ||
356 set1->hashCode() != set1copy->hashCode()){
357 errln("FAIL : Error in copy construction");
358 return;
359 }
360
361 logln("Testing =operator");
362 UnicodeSet set1equal=*set1;
363 UnicodeSet set2equal=*set2;
364 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
365 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
366 errln("FAIL: Error in =operator");
367 }
368
369 logln("Testing clone()");
370 UnicodeSet *set1clone=set1->clone();
371 UnicodeSet *set2clone=set2->clone();
372 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
373 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
374 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
375 errln("FAIL: Error in clone");
376 }
377
378 logln("Testing hashcode");
379 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
380 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
381 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
382 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
383 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
384 errln("FAIL: Error in hashCode()");
385 }
386
387 delete set1;
388 delete set1copy;
389 delete set2;
390 delete set1clone;
391 delete set2clone;
392
393
394 }
395 void
TestAddRemove(void)396 UnicodeSetTest::TestAddRemove(void) {
397 UnicodeSet set; // Construct empty set
398 doAssert(set.isEmpty() == TRUE, "set should be empty");
399 doAssert(set.size() == 0, "size should be 0");
400 set.complement();
401 doAssert(set.size() == 0x110000, "size should be 0x110000");
402 set.clear();
403 set.add(0x0061, 0x007a);
404 expectPairs(set, "az");
405 doAssert(set.isEmpty() == FALSE, "set should not be empty");
406 doAssert(set.size() != 0, "size should not be equal to 0");
407 doAssert(set.size() == 26, "size should be equal to 26");
408 set.remove(0x006d, 0x0070);
409 expectPairs(set, "alqz");
410 doAssert(set.size() == 22, "size should be equal to 22");
411 set.remove(0x0065, 0x0067);
412 expectPairs(set, "adhlqz");
413 doAssert(set.size() == 19, "size should be equal to 19");
414 set.remove(0x0064, 0x0069);
415 expectPairs(set, "acjlqz");
416 doAssert(set.size() == 16, "size should be equal to 16");
417 set.remove(0x0063, 0x0072);
418 expectPairs(set, "absz");
419 doAssert(set.size() == 10, "size should be equal to 10");
420 set.add(0x0066, 0x0071);
421 expectPairs(set, "abfqsz");
422 doAssert(set.size() == 22, "size should be equal to 22");
423 set.remove(0x0061, 0x0067);
424 expectPairs(set, "hqsz");
425 set.remove(0x0061, 0x007a);
426 expectPairs(set, "");
427 doAssert(set.isEmpty() == TRUE, "set should be empty");
428 doAssert(set.size() == 0, "size should be 0");
429 set.add(0x0061);
430 doAssert(set.isEmpty() == FALSE, "set should not be empty");
431 doAssert(set.size() == 1, "size should not be equal to 1");
432 set.add(0x0062);
433 set.add(0x0063);
434 expectPairs(set, "ac");
435 doAssert(set.size() == 3, "size should not be equal to 3");
436 set.add(0x0070);
437 set.add(0x0071);
438 expectPairs(set, "acpq");
439 doAssert(set.size() == 5, "size should not be equal to 5");
440 set.clear();
441 expectPairs(set, "");
442 doAssert(set.isEmpty() == TRUE, "set should be empty");
443 doAssert(set.size() == 0, "size should be 0");
444
445 // Try removing an entire set from another set
446 expectPattern(set, "[c-x]", "cx");
447 UnicodeSet set2;
448 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
449 set.removeAll(set2);
450 expectPairs(set, "deluxx");
451
452 // Try adding an entire set to another set
453 expectPattern(set, "[jackiemclean]", "aacceein");
454 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
455 set.addAll(set2);
456 expectPairs(set, "aacehort");
457 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
458
459 // Try retaining an set of elements contained in another set (intersection)
460 UnicodeSet set3;
461 expectPattern(set3, "[a-c]", "ac");
462 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
463 set3.remove(0x0062);
464 expectPairs(set3, "aacc");
465 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
466 set.retainAll(set3);
467 expectPairs(set, "aacc");
468 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
469 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
470 set.clear();
471 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
472
473 // Test commutativity
474 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
475 expectPattern(set2, "[jackiemclean]", "aacceein");
476 set.addAll(set2);
477 expectPairs(set, "aacehort");
478 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
479
480
481
482
483 }
484
485 /**
486 * Make sure minimal representation is maintained.
487 */
TestMinimalRep()488 void UnicodeSetTest::TestMinimalRep() {
489 UErrorCode status = U_ZERO_ERROR;
490 // This is pretty thoroughly tested by checkCanonicalRep()
491 // run against the exhaustive operation results. Use the code
492 // here for debugging specific spot problems.
493
494 // 1 overlap against 2
495 UnicodeSet set("[h-km-q]", status);
496 if (U_FAILURE(status)) { errln("FAIL"); return; }
497 UnicodeSet set2("[i-o]", status);
498 if (U_FAILURE(status)) { errln("FAIL"); return; }
499 set.addAll(set2);
500 expectPairs(set, "hq");
501 // right
502 set.applyPattern("[a-m]", status);
503 if (U_FAILURE(status)) { errln("FAIL"); return; }
504 set2.applyPattern("[e-o]", status);
505 if (U_FAILURE(status)) { errln("FAIL"); return; }
506 set.addAll(set2);
507 expectPairs(set, "ao");
508 // left
509 set.applyPattern("[e-o]", status);
510 if (U_FAILURE(status)) { errln("FAIL"); return; }
511 set2.applyPattern("[a-m]", status);
512 if (U_FAILURE(status)) { errln("FAIL"); return; }
513 set.addAll(set2);
514 expectPairs(set, "ao");
515 // 1 overlap against 3
516 set.applyPattern("[a-eg-mo-w]", status);
517 if (U_FAILURE(status)) { errln("FAIL"); return; }
518 set2.applyPattern("[d-q]", status);
519 if (U_FAILURE(status)) { errln("FAIL"); return; }
520 set.addAll(set2);
521 expectPairs(set, "aw");
522 }
523
TestAPI()524 void UnicodeSetTest::TestAPI() {
525 UErrorCode status = U_ZERO_ERROR;
526 // default ct
527 UnicodeSet set;
528 if (!set.isEmpty() || set.getRangeCount() != 0) {
529 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
530 set);
531 }
532
533 // clear(), isEmpty()
534 set.add(0x0061);
535 if (set.isEmpty()) {
536 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
537 set);
538 }
539 set.clear();
540 if (!set.isEmpty()) {
541 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
542 set);
543 }
544
545 // size()
546 set.clear();
547 if (set.size() != 0) {
548 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
549 ": " + set);
550 }
551 set.add(0x0061);
552 if (set.size() != 1) {
553 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
554 ": " + set);
555 }
556 set.add(0x0031, 0x0039);
557 if (set.size() != 10) {
558 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
559 ": " + set);
560 }
561
562 // contains(first, last)
563 set.clear();
564 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
565 if (U_FAILURE(status)) { errln("FAIL"); return; }
566 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
567 UChar32 a = set.getRangeStart(i);
568 UChar32 b = set.getRangeEnd(i);
569 if (!set.contains(a, b)) {
570 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
571 " but doesn't: " + set);
572 }
573 if (set.contains((UChar32)(a-1), b)) {
574 errln((UnicodeString)"FAIL, shouldn't contain " +
575 (unsigned short)(a-1) + '-' + (unsigned short)b +
576 " but does: " + set);
577 }
578 if (set.contains(a, (UChar32)(b+1))) {
579 errln((UnicodeString)"FAIL, shouldn't contain " +
580 (unsigned short)a + '-' + (unsigned short)(b+1) +
581 " but does: " + set);
582 }
583 }
584
585 // Ported InversionList test.
586 UnicodeSet a((UChar32)3,(UChar32)10);
587 UnicodeSet b((UChar32)7,(UChar32)15);
588 UnicodeSet c;
589
590 logln((UnicodeString)"a [3-10]: " + a);
591 logln((UnicodeString)"b [7-15]: " + b);
592 c = a;
593 c.addAll(b);
594 UnicodeSet exp((UChar32)3,(UChar32)15);
595 if (c == exp) {
596 logln((UnicodeString)"c.set(a).add(b): " + c);
597 } else {
598 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
599 }
600 c.complement();
601 exp.set((UChar32)0, (UChar32)2);
602 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
603 if (c == exp) {
604 logln((UnicodeString)"c.complement(): " + c);
605 } else {
606 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
607 }
608 c.complement();
609 exp.set((UChar32)3, (UChar32)15);
610 if (c == exp) {
611 logln((UnicodeString)"c.complement(): " + c);
612 } else {
613 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
614 }
615 c = a;
616 c.complementAll(b);
617 exp.set((UChar32)3,(UChar32)6);
618 exp.add((UChar32)11,(UChar32) 15);
619 if (c == exp) {
620 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
621 } else {
622 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
623 }
624
625 exp = c;
626 bitsToSet(setToBits(c), c);
627 if (c == exp) {
628 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
629 } else {
630 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
631 }
632
633 // Additional tests for coverage JB#2118
634 //UnicodeSet::complement(class UnicodeString const &)
635 //UnicodeSet::complementAll(class UnicodeString const &)
636 //UnicodeSet::containsNone(class UnicodeSet const &)
637 //UnicodeSet::containsNone(long,long)
638 //UnicodeSet::containsSome(class UnicodeSet const &)
639 //UnicodeSet::containsSome(long,long)
640 //UnicodeSet::removeAll(class UnicodeString const &)
641 //UnicodeSet::retain(long)
642 //UnicodeSet::retainAll(class UnicodeString const &)
643 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
644 //UnicodeSetIterator::getString(void)
645 set.clear();
646 set.complement("ab");
647 exp.applyPattern("[{ab}]", status);
648 if (U_FAILURE(status)) { errln("FAIL"); return; }
649 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
650
651 UnicodeSetIterator iset(set);
652 if (!iset.next() || !iset.isString()) {
653 errln("FAIL: UnicodeSetIterator::next/isString");
654 } else if (iset.getString() != "ab") {
655 errln("FAIL: UnicodeSetIterator::getString");
656 }
657
658 set.add(u'a', u'z');
659 set.complementAll("alan");
660 exp.applyPattern("[{ab}b-kmo-z]", status);
661 if (U_FAILURE(status)) { errln("FAIL"); return; }
662 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
663
664 exp.applyPattern("[a-z]", status);
665 if (U_FAILURE(status)) { errln("FAIL"); return; }
666 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
667 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
668 exp.applyPattern("[aln]", status);
669 if (U_FAILURE(status)) { errln("FAIL"); return; }
670 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
671 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
672
673 if (set.containsNone(u'a', u'z')) {
674 errln("FAIL: containsNone(UChar32, UChar32)");
675 }
676 if (!set.containsSome(u'a', u'z')) {
677 errln("FAIL: containsSome(UChar32, UChar32)");
678 }
679 if (!set.containsNone(u'A', u'Z')) {
680 errln("FAIL: containsNone(UChar32, UChar32)");
681 }
682 if (set.containsSome(u'A', u'Z')) {
683 errln("FAIL: containsSome(UChar32, UChar32)");
684 }
685
686 set.removeAll("liu");
687 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
688 if (U_FAILURE(status)) { errln("FAIL"); return; }
689 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
690
691 set.retainAll("star");
692 exp.applyPattern("[rst]", status);
693 if (U_FAILURE(status)) { errln("FAIL"); return; }
694 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
695
696 set.retain(u's');
697 exp.applyPattern("[s]", status);
698 if (U_FAILURE(status)) { errln("FAIL"); return; }
699 if (set != exp) { errln("FAIL: retain('s')"); return; }
700
701 // ICU 2.6 coverage tests
702 // public final UnicodeSet retain(String s);
703 // public final UnicodeSet remove(int c);
704 // public final UnicodeSet remove(String s);
705 // public int hashCode();
706 set.applyPattern(u"[a-z{ab}{cd}]", status);
707 if (U_FAILURE(status)) { errln("FAIL"); return; }
708 set.retain(u"cd");
709 exp.applyPattern(u"[{cd}]", status);
710 if (U_FAILURE(status)) { errln("FAIL"); return; }
711 if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
712
713 set.applyPattern(u"[a-z{ab}{yz}]", status);
714 if (U_FAILURE(status)) { errln("FAIL"); return; }
715 set.retain(u"cd");
716 exp.clear();
717 if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
718
719 set.applyPattern(u"[a-z{ab}{cd}]", status);
720 if (U_FAILURE(status)) { errln("FAIL"); return; }
721 set.remove(u'c');
722 exp.applyPattern(u"[abd-z{ab}{cd}]", status);
723 if (set != exp) { errln("FAIL: remove('c')"); return; }
724
725 set.remove(u"cd");
726 exp.applyPattern(u"[abd-z{ab}]", status);
727 if (U_FAILURE(status)) { errln("FAIL"); return; }
728 if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
729
730 set.applyPattern("[s]", status);
731 if (U_FAILURE(status)) { errln("FAIL"); return; }
732 uint16_t buf[32];
733 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
734 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
735 if (slen != 3 || buf[0] != 2 || buf[1] != u's' || buf[2] != u't') {
736 errln("FAIL: serialize");
737 return;
738 }
739
740 // Conversions to and from USet
741 UnicodeSet *uniset = &set;
742 USet *uset = uniset->toUSet();
743 TEST_ASSERT((void *)uset == (void *)uniset);
744 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
745 TEST_ASSERT((void *)setx == (void *)uset);
746 const UnicodeSet *constSet = uniset;
747 const USet *constUSet = constSet->toUSet();
748 TEST_ASSERT((void *)constUSet == (void *)constSet);
749 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
750 TEST_ASSERT((void *)constSetx == (void *)constUSet);
751
752 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
753 UnicodeString longString=u"aaaaaaaaaabbbbbbbbbbcccccccccc";
754 UnicodeSet ac(0x61, 0x63);
755 ac.remove(0x62).freeze();
756 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
757 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
758 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
759 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
760 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
761 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
762 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
763 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
764 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
765 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
766 ) {
767 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
768 }
769 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
770 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
771 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
772 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
773 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
774 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
775 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
776 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
777 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
778 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
779 ) {
780 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
781 }
782 }
783
TestIteration()784 void UnicodeSetTest::TestIteration() {
785 UErrorCode ec = U_ZERO_ERROR;
786 int i = 0;
787 int outerLoop;
788
789 // 6 code points, 3 ranges, 2 strings, 8 total elements
790 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
791 UnicodeSet set(u"[zabyc\\U0001abcd{str1}{str2}]", ec);
792 TEST_ASSERT_SUCCESS(ec);
793 UnicodeSetIterator it(set);
794
795 for (outerLoop=0; outerLoop<3; outerLoop++) {
796 // Run the test multiple times, to check that iterator.reset() is working.
797 for (i=0; i<10; i++) {
798 UBool nextv = it.next();
799 UBool isString = it.isString();
800 int32_t codePoint = it.getCodepoint();
801 //int32_t codePointEnd = it.getCodepointEnd();
802 UnicodeString s = it.getString();
803 switch (i) {
804 case 0:
805 TEST_ASSERT(nextv == TRUE);
806 TEST_ASSERT(isString == FALSE);
807 TEST_ASSERT(codePoint==0x61);
808 TEST_ASSERT(s == "a");
809 break;
810 case 1:
811 TEST_ASSERT(nextv == TRUE);
812 TEST_ASSERT(isString == FALSE);
813 TEST_ASSERT(codePoint==0x62);
814 TEST_ASSERT(s == "b");
815 break;
816 case 2:
817 TEST_ASSERT(nextv == TRUE);
818 TEST_ASSERT(isString == FALSE);
819 TEST_ASSERT(codePoint==0x63);
820 TEST_ASSERT(s == "c");
821 break;
822 case 3:
823 TEST_ASSERT(nextv == TRUE);
824 TEST_ASSERT(isString == FALSE);
825 TEST_ASSERT(codePoint==0x79);
826 TEST_ASSERT(s == "y");
827 break;
828 case 4:
829 TEST_ASSERT(nextv == TRUE);
830 TEST_ASSERT(isString == FALSE);
831 TEST_ASSERT(codePoint==0x7a);
832 TEST_ASSERT(s == "z");
833 break;
834 case 5:
835 TEST_ASSERT(nextv == TRUE);
836 TEST_ASSERT(isString == FALSE);
837 TEST_ASSERT(codePoint==0x1abcd);
838 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
839 break;
840 case 6:
841 TEST_ASSERT(nextv == TRUE);
842 TEST_ASSERT(isString == TRUE);
843 TEST_ASSERT(s == "str1");
844 break;
845 case 7:
846 TEST_ASSERT(nextv == TRUE);
847 TEST_ASSERT(isString == TRUE);
848 TEST_ASSERT(s == "str2");
849 break;
850 case 8:
851 TEST_ASSERT(nextv == FALSE);
852 break;
853 case 9:
854 TEST_ASSERT(nextv == FALSE);
855 break;
856 }
857 }
858 it.reset(); // prepare to run the iteration again.
859 }
860 }
861
862
863
864
TestStrings()865 void UnicodeSetTest::TestStrings() {
866 UErrorCode ec = U_ZERO_ERROR;
867
868 UnicodeSet* testList[] = {
869 UnicodeSet::createFromAll("abc"),
870 new UnicodeSet("[a-c]", ec),
871
872 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
873 new UnicodeSet("[{ll}{ch}a-z]", ec),
874
875 UnicodeSet::createFrom("ab}c"),
876 new UnicodeSet("[{ab\\}c}]", ec),
877
878 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
879 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
880
881 NULL
882 };
883
884 if (U_FAILURE(ec)) {
885 errln("FAIL: couldn't construct test sets");
886 }
887 assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
888 assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
889
890 for (int32_t i = 0; testList[i] != NULL; i+=2) {
891 if (U_SUCCESS(ec)) {
892 UnicodeString pat0, pat1;
893 testList[i]->toPattern(pat0, TRUE);
894 testList[i+1]->toPattern(pat1, TRUE);
895 if (*testList[i] == *testList[i+1]) {
896 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
897 } else {
898 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
899 }
900 }
901 delete testList[i];
902 delete testList[i+1];
903 }
904 }
905
906 /**
907 * Test the [:Latin:] syntax.
908 */
TestScriptSet()909 void UnicodeSetTest::TestScriptSet() {
910 expectContainment(u"[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
911
912 expectContainment(u"[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
913
914 /* Jitterbug 1423 */
915 expectContainment(u"[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
916
917 }
918
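/**
 * Test property-based set patterns (such as [:prop:], \p{...}, \P{...} and
 * \N{...}) using a table of pattern / chars-in / chars-not-in triples.
 */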
TestPropertySet()922 void UnicodeSetTest::TestPropertySet() {
923 static const char* const DATA[] = {
924 // Pattern, Chars IN, Chars NOT in
925
926 "[:Latin:]",
927 "aA",
928 "\\u0391\\u03B1",
929
930 "[\\p{Greek}]",
931 "\\u0391\\u03B1",
932 "aA",
933
934 "\\P{ GENERAL Category = upper case letter }",
935 "abc",
936 "ABC",
937
938 #if !UCONFIG_NO_NORMALIZATION
939 // Combining class: @since ICU 2.2
940 // Check both symbolic and numeric
941 "\\p{ccc=Nukta}",
942 "\\u0ABC",
943 "abc",
944
945 "\\p{Canonical Combining Class = 11}",
946 "\\u05B1",
947 "\\u05B2",
948
949 "[:c c c = iota subscript :]",
950 "\\u0345",
951 "xyz",
952 #endif
953
954 // Bidi class: @since ICU 2.2
955 "\\p{bidiclass=lefttoright}",
956 "abc",
957 "\\u0671\\u0672",
958
959 // Binary properties: @since ICU 2.2
960 "\\p{ideographic}",
961 "\\u4E0A",
962 "x",
963
964 "[:math=false:]",
965 "q)*(",
966 // weiv: )(and * were removed from math in Unicode 4.0.1
967 //"(*+)",
968 "+<>^",
969
970 // JB#1767 \N{}, \p{ASCII}
971 "[:Ascii:]",
972 "abc\\u0000\\u007F",
973 "\\u0080\\u4E00",
974
975 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
976 "az",
977 "qrs",
978
979 // JB#2015
980 "[:any:]",
981 "a\\U0010FFFF",
982 "",
983
984 "[:nv=0.5:]",
985 "\\u00BD\\u0F2A",
986 "\\u00BC",
987
988 // JB#2653: Age
989 "[:Age=1.1:]",
990 "\\u03D6", // 1.1
991 "\\u03D8\\u03D9", // 3.2
992
993 "[:Age=3.1:]",
994 "\\u1800\\u3400\\U0002f800",
995 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
996
997 // JB#2350: Case_Sensitive
998 "[:Case Sensitive:]",
999 "A\\u1FFC\\U00010410",
1000 ";\\u00B4\\U00010500",
1001
1002 // JB#2832: C99-compatibility props
1003 "[:blank:]",
1004 " \\u0009",
1005 "1-9A-Z",
1006
1007 "[:graph:]",
1008 "19AZ",
1009 " \\u0003\\u0007\\u0009\\u000A\\u000D",
1010
1011 "[:punct:]",
1012 "!@#%&*()[]{}-_\\/;:,.?'\"",
1013 "09azAZ",
1014
1015 "[:xdigit:]",
1016 "09afAF",
1017 "gG!",
1018
1019 // Regex compatibility test
1020 "[-b]", // leading '-' is literal
1021 "-b",
1022 "ac",
1023
1024 "[^-b]", // leading '-' is literal
1025 "ac",
1026 "-b",
1027
1028 "[b-]", // trailing '-' is literal
1029 "-b",
1030 "ac",
1031
1032 "[^b-]", // trailing '-' is literal
1033 "ac",
1034 "-b",
1035
1036 "[a-b-]", // trailing '-' is literal
1037 "ab-",
1038 "c=",
1039
1040 "[[a-q]&[p-z]-]", // trailing '-' is literal
1041 "pq-",
1042 "or=",
1043
1044 "[\\s|\\)|:|$|\\>]", // from regex tests
1045 "s|):$>",
1046 "abc",
1047
1048 "[\\uDC00cd]", // JB#2906: isolated trail at start
1049 "cd\\uDC00",
1050 "ab\\uD800\\U00010000",
1051
1052 "[ab\\uD800]", // JB#2906: isolated trail at start
1053 "ab\\uD800",
1054 "cd\\uDC00\\U00010000",
1055
1056 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1057 "abcd\\uD800",
1058 "ef\\uDC00\\U00010000",
1059
1060 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1061 "abcd\\uDC00",
1062 "ef\\uD800\\U00010000",
1063
1064 #if !UCONFIG_NO_NORMALIZATION
1065 "[:^lccc=0:]", // Lead canonical class
1066 "\\u0300\\u0301",
1067 "abcd\\u00c0\\u00c5",
1068
1069 "[:^tccc=0:]", // Trail canonical class
1070 "\\u0300\\u0301\\u00c0\\u00c5",
1071 "abcd",
1072
1073 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1074 "\\u0300\\u0301\\u00c0\\u00c5",
1075 "abcd",
1076
1077 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1078 "",
1079 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1080
1081 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1082 "\\u0F73\\u0F75\\u0F81",
1083 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1084 #endif /* !UCONFIG_NO_NORMALIZATION */
1085
1086 "[:Assigned:]",
1087 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1088 "\\u0558\\uFDD3\\uFFFE\\U00050005",
1089
1090 // Script_Extensions, new in Unicode 6.0
1091 "[:scx=Arab:]",
1092 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1093 "\\u088F\\uFDEF\\uFEFE",
1094
1095 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1096 // so scx-sc is missing U+FDF2.
1097 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1098 "\\u0640\\u064B\\u0650\\u0655",
1099 "\\uFDF2"
1100 };
1101
1102 static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1103
1104 for (int32_t i=0; i<DATA_LEN; i+=3) {
1105 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1106 CharsToUnicodeString(DATA[i+2]));
1107 }
1108 }
1109
1110 /**
1111 * Test that Posix style character classes [:digit:], etc.
1112 * have the Unicode definitions from TR 18.
1113 */
TestPosixClasses()1114 void UnicodeSetTest::TestPosixClasses() {
1115 {
1116 UErrorCode status = U_ZERO_ERROR;
1117 UnicodeSet s1("[:alpha:]", status);
1118 UnicodeSet s2(u"\\p{Alphabetic}", status);
1119 TEST_ASSERT_SUCCESS(status);
1120 TEST_ASSERT(s1==s2);
1121 }
1122 {
1123 UErrorCode status = U_ZERO_ERROR;
1124 UnicodeSet s1("[:lower:]", status);
1125 UnicodeSet s2(u"\\p{lowercase}", status);
1126 TEST_ASSERT_SUCCESS(status);
1127 TEST_ASSERT(s1==s2);
1128 }
1129 {
1130 UErrorCode status = U_ZERO_ERROR;
1131 UnicodeSet s1("[:upper:]", status);
1132 UnicodeSet s2(u"\\p{Uppercase}", status);
1133 TEST_ASSERT_SUCCESS(status);
1134 TEST_ASSERT(s1==s2);
1135 }
1136 {
1137 UErrorCode status = U_ZERO_ERROR;
1138 UnicodeSet s1("[:punct:]", status);
1139 UnicodeSet s2(u"\\p{gc=Punctuation}", status);
1140 TEST_ASSERT_SUCCESS(status);
1141 TEST_ASSERT(s1==s2);
1142 }
1143 {
1144 UErrorCode status = U_ZERO_ERROR;
1145 UnicodeSet s1("[:digit:]", status);
1146 UnicodeSet s2(u"\\p{gc=DecimalNumber}", status);
1147 TEST_ASSERT_SUCCESS(status);
1148 TEST_ASSERT(s1==s2);
1149 }
1150 {
1151 UErrorCode status = U_ZERO_ERROR;
1152 UnicodeSet s1("[:xdigit:]", status);
1153 UnicodeSet s2(u"[\\p{DecimalNumber}\\p{HexDigit}]", status);
1154 TEST_ASSERT_SUCCESS(status);
1155 TEST_ASSERT(s1==s2);
1156 }
1157 {
1158 UErrorCode status = U_ZERO_ERROR;
1159 UnicodeSet s1("[:alnum:]", status);
1160 UnicodeSet s2(u"[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1161 TEST_ASSERT_SUCCESS(status);
1162 TEST_ASSERT(s1==s2);
1163 }
1164 {
1165 UErrorCode status = U_ZERO_ERROR;
1166 UnicodeSet s1("[:space:]", status);
1167 UnicodeSet s2(u"\\p{Whitespace}", status);
1168 TEST_ASSERT_SUCCESS(status);
1169 TEST_ASSERT(s1==s2);
1170 }
1171 {
1172 UErrorCode status = U_ZERO_ERROR;
1173 UnicodeSet s1("[:blank:]", status);
1174 TEST_ASSERT_SUCCESS(status);
1175 UnicodeSet s2(u"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1176 status);
1177 TEST_ASSERT_SUCCESS(status);
1178 TEST_ASSERT(s1==s2);
1179 }
1180 {
1181 UErrorCode status = U_ZERO_ERROR;
1182 UnicodeSet s1("[:cntrl:]", status);
1183 TEST_ASSERT_SUCCESS(status);
1184 UnicodeSet s2(u"\\p{Control}", status);
1185 TEST_ASSERT_SUCCESS(status);
1186 TEST_ASSERT(s1==s2);
1187 }
1188 {
1189 UErrorCode status = U_ZERO_ERROR;
1190 UnicodeSet s1("[:graph:]", status);
1191 TEST_ASSERT_SUCCESS(status);
1192 UnicodeSet s2(u"[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1193 TEST_ASSERT_SUCCESS(status);
1194 TEST_ASSERT(s1==s2);
1195 }
1196 {
1197 UErrorCode status = U_ZERO_ERROR;
1198 UnicodeSet s1("[:print:]", status);
1199 TEST_ASSERT_SUCCESS(status);
1200 UnicodeSet s2(u"[[:graph:][:blank:]-[\\p{Control}]]", status);
1201 TEST_ASSERT_SUCCESS(status);
1202 TEST_ASSERT(s1==s2);
1203 }
1204 }
1205 /**
1206 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1207 */
TestClone()1208 void UnicodeSetTest::TestClone() {
1209 UErrorCode ec = U_ZERO_ERROR;
1210 UnicodeSet s("[abcxyz]", ec);
1211 UnicodeSet t(s);
1212 expectContainment(t, "abc", "def");
1213 }
1214
1215 /**
1216 * Test the indexOf() and charAt() methods.
1217 */
TestIndexOf()1218 void UnicodeSetTest::TestIndexOf() {
1219 UErrorCode ec = U_ZERO_ERROR;
1220 UnicodeSet set("[a-cx-y3578]", ec);
1221 if (U_FAILURE(ec)) {
1222 errln("FAIL: UnicodeSet constructor");
1223 return;
1224 }
1225 for (int32_t i=0; i<set.size(); ++i) {
1226 UChar32 c = set.charAt(i);
1227 if (set.indexOf(c) != i) {
1228 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1229 i, c, set.indexOf(c));
1230 }
1231 }
1232 UChar32 c = set.charAt(set.size());
1233 if (c != -1) {
1234 errln("FAIL: charAt(<out of range>) = %X", c);
1235 }
1236 int32_t j = set.indexOf(u'q');
1237 if (j != -1) {
1238 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1239 }
1240 }
1241
1242 /**
1243 * Test closure API.
1244 */
TestCloseOver()1245 void UnicodeSetTest::TestCloseOver() {
1246 UErrorCode ec = U_ZERO_ERROR;
1247
1248 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1249 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1250 const char* DATA[] = {
1251 // selector, input, output
1252 CASE,
1253 "[aq\\u00DF{Bc}{bC}{Fi}]",
1254 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1255
1256 CASE,
1257 "[\\u01F1]", // 'DZ'
1258 "[\\u01F1\\u01F2\\u01F3]",
1259
1260 CASE,
1261 "[\\u1FB4]",
1262 "[\\u1FB4{\\u03AC\\u03B9}]",
1263
1264 CASE,
1265 "[{F\\uFB01}]",
1266 "[\\uFB03{ffi}]",
1267
1268 CASE, // make sure binary search finds limits
1269 "[a\\uFF3A]",
1270 "[aA\\uFF3A\\uFF5A]",
1271
1272 CASE,
1273 "[a-z]","[A-Za-z\\u017F\\u212A]",
1274 CASE,
1275 "[abc]","[A-Ca-c]",
1276 CASE,
1277 "[ABC]","[A-Ca-c]",
1278
1279 CASE, "[i]", "[iI]",
1280
1281 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1282 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1283
1284 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1285
1286 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1287
1288 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1289
1290 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1291
1292 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1293
1294 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1295
1296 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1297 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1298
1299 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1300
1301 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1302
1303 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1304
1305 #if !UCONFIG_NO_FILE_IO
1306 CASE_MAPPINGS,
1307 "[aq\\u00DF{Bc}{bC}{Fi}]",
1308 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1309 #endif
1310
1311 CASE_MAPPINGS,
1312 "[\\u01F1]", // 'DZ'
1313 "[\\u01F1\\u01F2\\u01F3]",
1314
1315 CASE_MAPPINGS,
1316 "[a-z]",
1317 "[A-Za-z]",
1318
1319 NULL
1320 };
1321
1322 UnicodeSet s;
1323 UnicodeSet t;
1324 UnicodeString buf;
1325 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1326 int32_t selector = DATA[i][0];
1327 UnicodeString pat(DATA[i+1], -1, US_INV);
1328 UnicodeString exp(DATA[i+2], -1, US_INV);
1329 s.applyPattern(pat, ec);
1330 s.closeOver(selector);
1331 t.applyPattern(exp, ec);
1332 if (U_FAILURE(ec)) {
1333 errln("FAIL: applyPattern failed");
1334 continue;
1335 }
1336 if (s == t) {
1337 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1338 } else {
1339 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1340 s.toPattern(buf, TRUE) + ", expected " + exp);
1341 }
1342 }
1343
1344 #if 0
1345 /*
1346 * Unused test code.
1347 * This was used to compare the old implementation (using USET_CASE)
1348 * with the new one (using 0x100 temporarily)
1349 * while transitioning from hardcoded case closure tables in uniset.cpp
1350 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1351 * and using ucase.c functions for closure.
1352 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1353 *
1354 * Note: The old and new implementation never fully matched because
1355 * the old implementation turned out to not map U+0130 and U+0131 correctly
1356 * (dotted I and dotless i) and because the old implementation's data tables
1357 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1358 * new implementation. (So sigmas and some other characters were not handled
1359 * according to the newer Unicode version.)
1360 */
1361 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1362 UnicodeSetIterator si(sens);
1363 UnicodeString str, buf2;
1364 const UnicodeString *pStr;
1365 UChar32 c;
1366 while(si.next()) {
1367 if(!si.isString()) {
1368 c=si.getCodepoint();
1369 s.clear();
1370 s.add(c);
1371
1372 str.setTo(c);
1373 str.foldCase();
1374 sens2.add(str);
1375
1376 t=s;
1377 s.closeOver(USET_CASE);
1378 t.closeOver(0x100);
1379 if(s!=t) {
1380 errln("FAIL: closeOver(U+%04x) differs: ", c);
1381 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1382 }
1383 }
1384 }
1385 // remove all code points
1386 // should contain all full case folding mapping strings
1387 sens2.remove(0, 0x10ffff);
1388 si.reset(sens2);
1389 while(si.next()) {
1390 if(si.isString()) {
1391 pStr=&si.getString();
1392 s.clear();
1393 s.add(*pStr);
1394 t=s2=s;
1395 s.closeOver(USET_CASE);
1396 t.closeOver(0x100);
1397 if(s!=t) {
1398 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1399 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1400 }
1401 }
1402 }
1403 #endif
1404
1405 // Test the pattern API
1406 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1407 if (U_FAILURE(ec)) {
1408 errln("FAIL: applyPattern failed");
1409 } else {
1410 expectContainment(s, "abcABC", "defDEF");
1411 }
1412 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1413 if (U_FAILURE(ec)) {
1414 errln("FAIL: constructor failed");
1415 } else {
1416 expectContainment(v, "defDEF", "abcABC");
1417 }
1418 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1419 if (U_FAILURE(ec)) {
1420 errln("FAIL: construct w/case mappings failed");
1421 } else {
1422 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1423 }
1424 }
1425
TestEscapePattern()1426 void UnicodeSetTest::TestEscapePattern() {
1427 const char pattern[] =
1428 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1429 const char exp[] =
1430 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1431 // We test this with two passes; in the second pass we
1432 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1433 // this fails -- which is what we expect.
1434 for (int32_t pass=1; pass<=2; ++pass) {
1435 UErrorCode ec = U_ZERO_ERROR;
1436 UnicodeString pat(pattern, -1, US_INV);
1437 if (pass==2) {
1438 pat = pat.unescape();
1439 }
1440 // Pattern is only good for pass 1
1441 UBool isPatternValid = (pass==1);
1442
1443 UnicodeSet set(pat, ec);
1444 if (U_SUCCESS(ec) != isPatternValid){
1445 errln((UnicodeString)"FAIL: applyPattern(" +
1446 escape(pat) + ") => " +
1447 u_errorName(ec));
1448 continue;
1449 }
1450 if (U_FAILURE(ec)) {
1451 continue;
1452 }
1453 if (set.contains(u'\u0644')){
1454 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1455 }
1456
1457 UnicodeString newpat;
1458 set.toPattern(newpat, TRUE);
1459 if (newpat == UnicodeString(exp, -1, US_INV)) {
1460 logln(escape(pat) + " => " + newpat);
1461 } else {
1462 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1463 }
1464
1465 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1466 UnicodeString str("Range ");
1467 str.append((UChar)(u'0' + i))
1468 .append(": ")
1469 .append((UChar32)set.getRangeStart(i))
1470 .append(" - ")
1471 .append((UChar32)set.getRangeEnd(i));
1472 str = str + " (" + set.getRangeStart(i) + " - " +
1473 set.getRangeEnd(i) + ")";
1474 if (set.getRangeStart(i) < 0) {
1475 errln((UnicodeString)"FAIL: " + escape(str));
1476 } else {
1477 logln(escape(str));
1478 }
1479 }
1480 }
1481 }
1482
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1483 void UnicodeSetTest::expectRange(const UnicodeString& label,
1484 const UnicodeSet& set,
1485 UChar32 start, UChar32 end) {
1486 UnicodeSet exp(start, end);
1487 UnicodeString pat;
1488 if (set == exp) {
1489 logln(label + " => " + set.toPattern(pat, TRUE));
1490 } else {
1491 UnicodeString xpat;
1492 errln((UnicodeString)"FAIL: " + label + " => " +
1493 set.toPattern(pat, TRUE) +
1494 ", expected " + exp.toPattern(xpat, TRUE));
1495 }
1496 }
1497
TestInvalidCodePoint()1498 void UnicodeSetTest::TestInvalidCodePoint() {
1499
1500 const UChar32 DATA[] = {
1501 // Test range Expected range
1502 0, 0x10FFFF, 0, 0x10FFFF,
1503 (UChar32)-1, 8, 0, 8,
1504 8, 0x110000, 8, 0x10FFFF
1505 };
1506 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1507
1508 UnicodeString pat;
1509 int32_t i;
1510
1511 for (i=0; i<DATA_LENGTH; i+=4) {
1512 UChar32 start = DATA[i];
1513 UChar32 end = DATA[i+1];
1514 UChar32 xstart = DATA[i+2];
1515 UChar32 xend = DATA[i+3];
1516
1517 // Try various API using the test code points
1518
1519 UnicodeSet set(start, end);
1520 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1521 set, xstart, xend);
1522
1523 set.clear();
1524 set.set(start, end);
1525 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1526 set, xstart, xend);
1527
1528 UBool b = set.contains(start);
1529 b = set.contains(start, end);
1530 b = set.containsNone(start, end);
1531 b = set.containsSome(start, end);
1532 (void)b; // Suppress set but not used warning.
1533
1534 /*int32_t index = set.indexOf(start);*/
1535
1536 set.clear();
1537 set.add(start);
1538 set.add(start, end);
1539 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1540 set, xstart, xend);
1541
1542 set.set(0, 0x10FFFF);
1543 set.retain(start, end);
1544 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1545 set, xstart, xend);
1546 set.retain(start);
1547
1548 set.set(0, 0x10FFFF);
1549 set.remove(start);
1550 set.remove(start, end);
1551 set.complement();
1552 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1553 set, xstart, xend);
1554
1555 set.set(0, 0x10FFFF);
1556 set.complement(start, end);
1557 set.complement();
1558 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1559 set, xstart, xend);
1560 set.complement(start);
1561 }
1562
1563 const UChar32 DATA2[] = {
1564 0,
1565 0x10FFFF,
1566 (UChar32)-1,
1567 0x110000
1568 };
1569 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1570
1571 for (i=0; i<DATA2_LENGTH; ++i) {
1572 UChar32 c = DATA2[i], end = 0x10FFFF;
1573 UBool valid = (c >= 0 && c <= 0x10FFFF);
1574
1575 UnicodeSet set(0, 0x10FFFF);
1576
1577 // For single-codepoint contains, invalid codepoints are NOT contained
1578 UBool b = set.contains(c);
1579 if (b == valid) {
1580 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1581 ") = " + b);
1582 } else {
1583 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1584 ") = " + b);
1585 }
1586
1587 // For codepoint range contains, containsNone, and containsSome,
1588 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1589 b = set.contains(c, end);
1590 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1591 "," + end + ") = " + b);
1592
1593 b = set.containsNone(c, end);
1594 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1595 "," + end + ") = " + b);
1596
1597 b = set.containsSome(c, end);
1598 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1599 "," + end + ") = " + b);
1600
1601 int32_t index = set.indexOf(c);
1602 if ((index >= 0) == valid) {
1603 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1604 ") = " + index);
1605 } else {
1606 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1607 ") = " + index);
1608 }
1609 }
1610 }
1611
1612 // Used by TestSymbolTable
1613 class TokenSymbolTable : public SymbolTable {
1614 public:
1615 Hashtable contents;
1616
TokenSymbolTable(UErrorCode & ec)1617 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1618 contents.setValueDeleter(uprv_deleteUObject);
1619 }
1620
~TokenSymbolTable()1621 ~TokenSymbolTable() {}
1622
1623 /**
1624 * (Non-SymbolTable API) Add the given variable and value to
1625 * the table. Variable should NOT contain leading '$'.
1626 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1627 void add(const UnicodeString& var, const UnicodeString& value,
1628 UErrorCode& ec) {
1629 if (U_SUCCESS(ec)) {
1630 contents.put(var, new UnicodeString(value), ec);
1631 }
1632 }
1633
1634 /**
1635 * SymbolTable API
1636 */
lookup(const UnicodeString & s) const1637 virtual const UnicodeString* lookup(const UnicodeString& s) const override {
1638 return (const UnicodeString*) contents.get(s);
1639 }
1640
1641 /**
1642 * SymbolTable API
1643 */
lookupMatcher(UChar32) const1644 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const override {
1645 return NULL;
1646 }
1647
1648 /**
1649 * SymbolTable API
1650 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1651 virtual UnicodeString parseReference(const UnicodeString& text,
1652 ParsePosition& pos, int32_t limit) const override {
1653 int32_t start = pos.getIndex();
1654 int32_t i = start;
1655 UnicodeString result;
1656 while (i < limit) {
1657 UChar c = text.charAt(i);
1658 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1659 break;
1660 }
1661 ++i;
1662 }
1663 if (i == start) { // No valid name chars
1664 return result; // Indicate failure with empty string
1665 }
1666 pos.setIndex(i);
1667 text.extractBetween(start, i, result);
1668 return result;
1669 }
1670 };
1671
TestSymbolTable()1672 void UnicodeSetTest::TestSymbolTable() {
1673 // Multiple test cases can be set up here. Each test case
1674 // is terminated by null:
1675 // var, value, var, value,..., input pat., exp. output pat., null
1676 const char* DATA[] = {
1677 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1678 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1679 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1680 NULL
1681 };
1682
1683 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1684 UErrorCode ec = U_ZERO_ERROR;
1685 TokenSymbolTable sym(ec);
1686 if (U_FAILURE(ec)) {
1687 errln("FAIL: couldn't construct TokenSymbolTable");
1688 continue;
1689 }
1690
1691 // Set up variables
1692 while (DATA[i+2] != NULL) {
1693 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1694 if (U_FAILURE(ec)) {
1695 errln("FAIL: couldn't add to TokenSymbolTable");
1696 continue;
1697 }
1698 i += 2;
1699 }
1700
1701 // Input pattern and expected output pattern
1702 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1703 i += 2;
1704
1705 ParsePosition pos(0);
1706 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1707 if (U_FAILURE(ec)) {
1708 errln("FAIL: couldn't construct UnicodeSet");
1709 continue;
1710 }
1711
1712 // results
1713 if (pos.getIndex() != inpat.length()) {
1714 errln((UnicodeString)"Failed to read to end of string \""
1715 + inpat + "\": read to "
1716 + pos.getIndex() + ", length is "
1717 + inpat.length());
1718 }
1719
1720 UnicodeSet us2(exppat, ec);
1721 if (U_FAILURE(ec)) {
1722 errln("FAIL: couldn't construct expected UnicodeSet");
1723 continue;
1724 }
1725
1726 UnicodeString a, b;
1727 if (us != us2) {
1728 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1729 ", expected " + us2.toPattern(b, TRUE));
1730 } else {
1731 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1732 }
1733 }
1734 }
1735
TestSurrogate()1736 void UnicodeSetTest::TestSurrogate() {
1737 const char* DATA[] = {
1738 // These should all behave identically
1739 "[abc\\uD800\\uDC00]",
1740 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1741 "[abc\\U00010000]",
1742 0
1743 };
1744 for (int i=0; DATA[i] != 0; ++i) {
1745 UErrorCode ec = U_ZERO_ERROR;
1746 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1747 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1748 UnicodeSet set(str, ec);
1749 if (U_FAILURE(ec)) {
1750 errln("FAIL: UnicodeSet constructor");
1751 continue;
1752 }
1753 expectContainment(set,
1754 CharsToUnicodeString("abc\\U00010000"),
1755 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1756 if (set.size() != 4) {
1757 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1758 set.size() + ", expected 4");
1759 }
1760
1761 {
1762 UErrorCode subErr = U_ZERO_ERROR;
1763 checkRoundTrip(set);
1764 checkSerializeRoundTrip(set, subErr);
1765 }
1766 }
1767 }
1768
TestExhaustive()1769 void UnicodeSetTest::TestExhaustive() {
1770 // exhaustive tests. Simulate UnicodeSets with integers.
1771 // That gives us very solid tests (except for large memory tests).
1772
1773 int32_t limit = 128;
1774
1775 UnicodeSet x, y, z, aa;
1776
1777 for (int32_t i = 0; i < limit; ++i) {
1778 bitsToSet(i, x);
1779 logln((UnicodeString)"Testing " + i + ", " + x);
1780 _testComplement(i, x, y);
1781
1782 UnicodeSet &toTest = bitsToSet(i, aa);
1783
1784 // AS LONG AS WE ARE HERE, check roundtrip
1785 checkRoundTrip(toTest);
1786 UErrorCode ec = U_ZERO_ERROR;
1787 checkSerializeRoundTrip(toTest, ec);
1788
1789 for (int32_t j = 0; j < limit; ++j) {
1790 _testAdd(i,j, x,y,z);
1791 _testXor(i,j, x,y,z);
1792 _testRetain(i,j, x,y,z);
1793 _testRemove(i,j, x,y,z);
1794 }
1795 }
1796 }
1797
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1798 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1799 bitsToSet(a, x);
1800 z = x;
1801 z.complement();
1802 int32_t c = setToBits(z);
1803 if (c != (~a)) {
1804 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1805 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1806 }
1807 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1808 }
1809
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1810 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1811 bitsToSet(a, x);
1812 bitsToSet(b, y);
1813 z = x;
1814 z.addAll(y);
1815 int32_t c = setToBits(z);
1816 if (c != (a | b)) {
1817 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1818 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1819 }
1820 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1821 }
1822
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1823 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1824 bitsToSet(a, x);
1825 bitsToSet(b, y);
1826 z = x;
1827 z.retainAll(y);
1828 int32_t c = setToBits(z);
1829 if (c != (a & b)) {
1830 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1831 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1832 }
1833 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1834 }
1835
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1836 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1837 bitsToSet(a, x);
1838 bitsToSet(b, y);
1839 z = x;
1840 z.removeAll(y);
1841 int32_t c = setToBits(z);
1842 if (c != (a &~ b)) {
1843 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1844 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1845 }
1846 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1847 }
1848
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1849 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1850 bitsToSet(a, x);
1851 bitsToSet(b, y);
1852 z = x;
1853 z.complementAll(y);
1854 int32_t c = setToBits(z);
1855 if (c != (a ^ b)) {
1856 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1857 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1858 }
1859 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1860 }
1861
1862 /**
1863 * Check that ranges are monotonically increasing and non-
1864 * overlapping.
1865 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1866 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1867 int32_t n = set.getRangeCount();
1868 if (n < 0) {
1869 errln((UnicodeString)"FAIL result of " + msg +
1870 ": range count should be >= 0 but is " +
1871 n /*+ " for " + set.toPattern())*/);
1872 return;
1873 }
1874 UChar32 last = 0;
1875 for (int32_t i=0; i<n; ++i) {
1876 UChar32 start = set.getRangeStart(i);
1877 UChar32 end = set.getRangeEnd(i);
1878 if (start > end) {
1879 errln((UnicodeString)"FAIL result of " + msg +
1880 ": range " + (i+1) +
1881 " start > end: " + (int)start + ", " + (int)end +
1882 " for " + set);
1883 }
1884 if (i > 0 && start <= last) {
1885 errln((UnicodeString)"FAIL result of " + msg +
1886 ": range " + (i+1) +
1887 " overlaps previous range: " + (int)start + ", " + (int)end +
1888 " for " + set);
1889 }
1890 last = end;
1891 }
1892 }
1893
1894 /**
1895 * Convert a bitmask to a UnicodeSet.
1896 */
bitsToSet(int32_t a,UnicodeSet & result)1897 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1898 result.clear();
1899 for (UChar32 i = 0; i < 32; ++i) {
1900 if ((a & (1<<i)) != 0) {
1901 result.add(i);
1902 }
1903 }
1904 return result;
1905 }
1906
1907 /**
1908 * Convert a UnicodeSet to a bitmask. Only the characters
1909 * U+0000 to U+0020 are represented in the bitmask.
1910 */
setToBits(const UnicodeSet & x)1911 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1912 int32_t result = 0;
1913 for (int32_t i = 0; i < 32; ++i) {
1914 if (x.contains((UChar32)i)) {
1915 result |= (1<<i);
1916 }
1917 }
1918 return result;
1919 }
1920
1921 /**
1922 * Return the representation of an inversion list based UnicodeSet
1923 * as a pairs list. Ranges are listed in ascending Unicode order.
1924 * For example, the set [a-zA-M3] is represented as "33AMaz".
1925 */
getPairs(const UnicodeSet & set)1926 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1927 UnicodeString pairs;
1928 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1929 UChar32 start = set.getRangeStart(i);
1930 UChar32 end = set.getRangeEnd(i);
1931 if (end > 0xFFFF) {
1932 end = 0xFFFF;
1933 i = set.getRangeCount(); // Should be unnecessary
1934 }
1935 pairs.append((UChar)start).append((UChar)end);
1936 }
1937 return pairs;
1938 }
1939
1940 /**
1941 * Basic consistency check for a few items.
1942 * That the iterator works, and that we can create a pattern and
1943 * get the same thing back
1944 */
checkRoundTrip(const UnicodeSet & s)1945 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1946 {
1947 UnicodeSet t(s);
1948 checkEqual(s, t, "copy ct");
1949 }
1950
1951 {
1952 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
1953 t = s;
1954 checkEqual(s, t, "operator=");
1955 }
1956
1957 {
1958 UnicodeSet t;
1959 copyWithIterator(t, s, FALSE);
1960 checkEqual(s, t, "iterator roundtrip");
1961 }
1962
1963 {
1964 UnicodeSet t;
1965 copyWithIterator(t, s, TRUE); // try range
1966 checkEqual(s, t, "iterator roundtrip");
1967 }
1968
1969 {
1970 UnicodeSet t;
1971 UnicodeString pat;
1972 UErrorCode ec = U_ZERO_ERROR;
1973 s.toPattern(pat, FALSE);
1974 t.applyPattern(pat, ec);
1975 if (U_FAILURE(ec)) {
1976 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1977 return;
1978 } else {
1979 checkEqual(s, t, "toPattern(false)");
1980 }
1981 }
1982
1983 {
1984 UnicodeSet t;
1985 UnicodeString pat;
1986 UErrorCode ec = U_ZERO_ERROR;
1987 s.toPattern(pat, TRUE);
1988 t.applyPattern(pat, ec);
1989 if (U_FAILURE(ec)) {
1990 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1991 return;
1992 } else {
1993 checkEqual(s, t, "toPattern(true)");
1994 }
1995 }
1996 }
1997
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1998 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1999 if(U_FAILURE(status)) return;
2000 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2001 if(status == U_BUFFER_OVERFLOW_ERROR) {
2002 status = U_ZERO_ERROR;
2003 serializeBuffer.resize(len);
2004 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2005 // let 2nd error stand
2006 }
2007 if(U_FAILURE(status)) {
2008 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
2009 return;
2010 }
2011 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
2012 if(U_FAILURE(status)) {
2013 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
2014 return;
2015 }
2016
2017 checkEqual(t, deserialized, "Set was unequal when deserialized");
2018 }
2019
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)2020 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
2021 t.clear();
2022 UnicodeSetIterator it(s);
2023 if (withRange) {
2024 while (it.nextRange()) {
2025 if (it.isString()) {
2026 t.add(it.getString());
2027 } else {
2028 t.add(it.getCodepoint(), it.getCodepointEnd());
2029 }
2030 }
2031 } else {
2032 while (it.next()) {
2033 if (it.isString()) {
2034 t.add(it.getString());
2035 } else {
2036 t.add(it.getCodepoint());
2037 }
2038 }
2039 }
2040 }
2041
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2042 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2043 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2044 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2045 UnicodeString source; s.toPattern(source, TRUE);
2046 UnicodeString result; t.toPattern(result, TRUE);
2047 if (s != t) {
2048 errln((UnicodeString)"FAIL: " + message
2049 + "; source = " + source
2050 + "; result = " + result
2051 );
2052 return FALSE;
2053 } else {
2054 logln((UnicodeString)"Ok: " + message
2055 + "; source = " + source
2056 + "; result = " + result
2057 );
2058 }
2059 return TRUE;
2060 }
2061
2062 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2063 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2064 const UnicodeString& charsIn,
2065 const UnicodeString& charsOut) {
2066 UErrorCode ec = U_ZERO_ERROR;
2067 UnicodeSet set(pat, ec);
2068 if (U_FAILURE(ec)) {
2069 dataerrln((UnicodeString)"FAIL: pattern \"" +
2070 pat + "\" => " + u_errorName(ec));
2071 return;
2072 }
2073 expectContainment(set, pat, charsIn, charsOut);
2074 }
2075
2076 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2077 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2078 const UnicodeString& charsIn,
2079 const UnicodeString& charsOut) {
2080 UnicodeString pat;
2081 set.toPattern(pat);
2082 expectContainment(set, pat, charsIn, charsOut);
2083 }
2084
2085 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2086 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2087 const UnicodeString& setName,
2088 const UnicodeString& charsIn,
2089 const UnicodeString& charsOut) {
2090 UnicodeString bad;
2091 UChar32 c;
2092 int32_t i;
2093
2094 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2095 c = charsIn.char32At(i);
2096 if (!set.contains(c)) {
2097 bad.append(c);
2098 }
2099 }
2100 if (bad.length() > 0) {
2101 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2102 ", expected containment of " + prettify(charsIn));
2103 } else {
2104 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2105 }
2106
2107 bad.truncate(0);
2108 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2109 c = charsOut.char32At(i);
2110 if (set.contains(c)) {
2111 bad.append(c);
2112 }
2113 }
2114 if (bad.length() > 0) {
2115 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2116 ", expected non-containment of " + prettify(charsOut));
2117 } else {
2118 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2119 }
2120 }
2121
2122 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2123 UnicodeSetTest::expectPattern(UnicodeSet& set,
2124 const UnicodeString& pattern,
2125 const UnicodeString& expectedPairs){
2126 UErrorCode status = U_ZERO_ERROR;
2127 set.applyPattern(pattern, status);
2128 if (U_FAILURE(status)) {
2129 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2130 "\") failed");
2131 return;
2132 } else {
2133 if (getPairs(set) != expectedPairs ) {
2134 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2135 "\") => pairs \"" +
2136 escape(getPairs(set)) + "\", expected \"" +
2137 escape(expectedPairs) + "\"");
2138 } else {
2139 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2140 "\") => pairs \"" +
2141 escape(getPairs(set)) + "\"");
2142 }
2143 }
2144 // the result of calling set.toPattern(), which is the string representation of
2145 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2146 // will produce another set that is equal to this one.
2147 UnicodeString temppattern;
2148 set.toPattern(temppattern);
2149 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2150 if (U_FAILURE(status)) {
2151 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2152 return;
2153 }
2154 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2155 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2156 escape(getPairs(set)) + "\""));
2157 } else{
2158 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2159 }
2160
2161 delete tempset;
2162
2163 }
2164
2165 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2166 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2167 if (getPairs(set) != expectedPairs) {
2168 errln(UnicodeString("FAIL: Expected pair list \"") +
2169 escape(expectedPairs) + "\", got \"" +
2170 escape(getPairs(set)) + "\"");
2171 }
2172 }
2173
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2174 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2175 const UnicodeString& expPat,
2176 const char** expStrings) {
2177 UnicodeString pat;
2178 set.toPattern(pat, TRUE);
2179 if (pat == expPat) {
2180 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2181 } else {
2182 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2183 return;
2184 }
2185 if (expStrings == NULL) {
2186 return;
2187 }
2188 UBool in = TRUE;
2189 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2190 if (expStrings[i] == NOT) { // sic; pointer comparison
2191 in = FALSE;
2192 continue;
2193 }
2194 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2195 UBool contained = set.contains(s);
2196 if (contained == in) {
2197 logln((UnicodeString)"Ok: " + expPat +
2198 (contained ? " contains {" : " does not contain {") +
2199 escape(expStrings[i]) + "}");
2200 } else {
2201 errln((UnicodeString)"FAIL: " + expPat +
2202 (contained ? " contains {" : " does not contain {") +
2203 escape(expStrings[i]) + "}");
2204 }
2205 }
2206 }
2207
toHexString(int32_t i)2208 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? u'0' : (u'A' - 10))); }
2209
2210 void
doAssert(UBool condition,const char * message)2211 UnicodeSetTest::doAssert(UBool condition, const char *message)
2212 {
2213 if (!condition) {
2214 errln(UnicodeString("ERROR : ") + message);
2215 }
2216 }
2217
2218 UnicodeString
escape(const UnicodeString & s)2219 UnicodeSetTest::escape(const UnicodeString& s) {
2220 UnicodeString buf;
2221 for (int32_t i=0; i<s.length(); )
2222 {
2223 UChar32 c = s.char32At(i);
2224 if (0x0020 <= c && c <= 0x007F) {
2225 buf += c;
2226 } else {
2227 if (c <= 0xFFFF) {
2228 buf += u"\\u";
2229 } else {
2230 buf += u"\\U";
2231 buf += toHexString((c & 0xF0000000) >> 28);
2232 buf += toHexString((c & 0x0F000000) >> 24);
2233 buf += toHexString((c & 0x00F00000) >> 20);
2234 buf += toHexString((c & 0x000F0000) >> 16);
2235 }
2236 buf += toHexString((c & 0xF000) >> 12);
2237 buf += toHexString((c & 0x0F00) >> 8);
2238 buf += toHexString((c & 0x00F0) >> 4);
2239 buf += toHexString(c & 0x000F);
2240 }
2241 i += U16_LENGTH(c);
2242 }
2243 return buf;
2244 }
2245
TestFreezable()2246 void UnicodeSetTest::TestFreezable() {
2247 UErrorCode errorCode=U_ZERO_ERROR;
2248 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2249 UnicodeSet idSet(idPattern, errorCode);
2250 if(U_FAILURE(errorCode)) {
2251 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2252 return;
2253 }
2254
2255 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2256 UnicodeSet wsSet(wsPattern, errorCode);
2257 if(U_FAILURE(errorCode)) {
2258 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2259 return;
2260 }
2261
2262 idSet.add(idPattern);
2263 UnicodeSet frozen(idSet);
2264 frozen.freeze();
2265
2266 if(idSet.isFrozen() || !frozen.isFrozen()) {
2267 errln("FAIL: isFrozen() is wrong");
2268 }
2269 if(frozen!=idSet || !(frozen==idSet)) {
2270 errln("FAIL: a copy-constructed frozen set differs from its original");
2271 }
2272
2273 frozen=wsSet;
2274 if(frozen!=idSet || !(frozen==idSet)) {
2275 errln("FAIL: a frozen set was modified by operator=");
2276 }
2277
2278 UnicodeSet frozen2(frozen);
2279 if(frozen2!=frozen || frozen2!=idSet) {
2280 errln("FAIL: a copied frozen set differs from its frozen original");
2281 }
2282 if(!frozen2.isFrozen()) {
2283 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2284 }
2285 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2286 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2287 errln("FAIL: UnicodeSet(5, 55) failed");
2288 }
2289 frozen3=frozen;
2290 if(!frozen3.isFrozen()) {
2291 errln("FAIL: copying a frozen set results in a thawed one");
2292 }
2293
2294 UnicodeSet *cloned=frozen.clone();
2295 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2296 errln("FAIL: clone() failed");
2297 }
2298 cloned->add(0xd802, 0xd805);
2299 if(cloned->containsSome(0xd802, 0xd805)) {
2300 errln("FAIL: unable to modify clone");
2301 }
2302 delete cloned;
2303
2304 UnicodeSet *thawed=frozen.cloneAsThawed();
2305 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2306 errln("FAIL: cloneAsThawed() failed");
2307 }
2308 thawed->add(0xd802, 0xd805);
2309 if(!thawed->contains(0xd802, 0xd805)) {
2310 errln("FAIL: unable to modify thawed clone");
2311 }
2312 delete thawed;
2313
2314 frozen.set(5, 55);
2315 if(frozen!=idSet || !(frozen==idSet)) {
2316 errln("FAIL: UnicodeSet::set() modified a frozen set");
2317 }
2318
2319 frozen.clear();
2320 if(frozen!=idSet || !(frozen==idSet)) {
2321 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2322 }
2323
2324 frozen.closeOver(USET_CASE_INSENSITIVE);
2325 if(frozen!=idSet || !(frozen==idSet)) {
2326 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2327 }
2328
2329 frozen.compact();
2330 if(frozen!=idSet || !(frozen==idSet)) {
2331 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2332 }
2333
2334 ParsePosition pos;
2335 frozen.
2336 applyPattern(wsPattern, errorCode).
2337 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2338 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2339 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2340 applyPropertyAlias(u"Assigned", UnicodeString(), errorCode);
2341 if(frozen!=idSet || !(frozen==idSet)) {
2342 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2343 }
2344
2345 frozen.
2346 add(0xd800).
2347 add(0xd802, 0xd805).
2348 add(wsPattern).
2349 addAll(idPattern).
2350 addAll(wsSet);
2351 if(frozen!=idSet || !(frozen==idSet)) {
2352 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2353 }
2354
2355 frozen.
2356 retain(0x62).
2357 retain(0x64, 0x69).
2358 retainAll(wsPattern).
2359 retainAll(wsSet);
2360 if(frozen!=idSet || !(frozen==idSet)) {
2361 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2362 }
2363
2364 frozen.
2365 remove(0x62).
2366 remove(0x64, 0x69).
2367 remove(idPattern).
2368 removeAll(idPattern).
2369 removeAll(idSet);
2370 if(frozen!=idSet || !(frozen==idSet)) {
2371 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2372 }
2373
2374 frozen.
2375 complement().
2376 complement(0x62).
2377 complement(0x64, 0x69).
2378 complement(idPattern).
2379 complementAll(idPattern).
2380 complementAll(idSet);
2381 if(frozen!=idSet || !(frozen==idSet)) {
2382 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2383 }
2384 }
2385
2386 // Test span() etc. -------------------------------------------------------- ***
2387
2388 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2389 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2390 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2391 UErrorCode errorCode=U_ZERO_ERROR;
2392 int32_t length8=0;
2393 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2394 if(U_SUCCESS(errorCode)) {
2395 return length8;
2396 } else {
2397 // The string contains an unpaired surrogate.
2398 // Ignore this string.
2399 return 0;
2400 }
2401 }
2402
2403 class UnicodeSetWithStringsIterator;
2404
2405 // Make the strings in a UnicodeSet easily accessible.
2406 class UnicodeSetWithStrings {
2407 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2408 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2409 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2410 int32_t size=set.size();
2411 if(size>0 && set.charAt(size-1)<0) {
2412 // If a set's last element is not a code point, then it must contain strings.
2413 // Iterate over the set, skip all code point ranges, and cache the strings.
2414 // Convert them to UTF-8 for spanUTF8().
2415 UnicodeSetIterator iter(set);
2416 const UnicodeString *s;
2417 char *s8=utf8;
2418 int32_t length8, utf8Count=0;
2419 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2420 if(iter.isString()) {
2421 // Store the pointer to the set's string element
2422 // which we happen to know is a stable pointer.
2423 strings[stringsLength]=s=&iter.getString();
2424 utf8Count+=
2425 utf8Lengths[stringsLength]=length8=
2426 appendUTF8(s->getBuffer(), s->length(),
2427 s8, (int32_t)(sizeof(utf8)-utf8Count));
2428 if(length8==0) {
2429 hasSurrogates=TRUE; // Contains unpaired surrogates.
2430 }
2431 s8+=length8;
2432 ++stringsLength;
2433 }
2434 }
2435 }
2436 }
2437
getSet() const2438 const UnicodeSet &getSet() const {
2439 return set;
2440 }
2441
hasStrings() const2442 UBool hasStrings() const {
2443 return (UBool)(stringsLength>0);
2444 }
2445
hasStringsWithSurrogates() const2446 UBool hasStringsWithSurrogates() const {
2447 return hasSurrogates;
2448 }
2449
2450 private:
2451 friend class UnicodeSetWithStringsIterator;
2452
2453 const UnicodeSet &set;
2454
2455 const UnicodeString *strings[20];
2456 int32_t stringsLength;
2457 UBool hasSurrogates;
2458
2459 char utf8[1024];
2460 int32_t utf8Lengths[20];
2461 };
2462
2463 class UnicodeSetWithStringsIterator {
2464 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2465 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2466 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2467 }
2468
reset()2469 void reset() {
2470 nextStringIndex=nextUTF8Start=0;
2471 }
2472
nextString()2473 const UnicodeString *nextString() {
2474 if(nextStringIndex<fSet.stringsLength) {
2475 return fSet.strings[nextStringIndex++];
2476 } else {
2477 return NULL;
2478 }
2479 }
2480
2481 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2482 const char *nextUTF8(int32_t &length) {
2483 if(nextStringIndex<fSet.stringsLength) {
2484 const char *s8=fSet.utf8+nextUTF8Start;
2485 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2486 return s8;
2487 } else {
2488 length=0;
2489 return NULL;
2490 }
2491 }
2492
2493 private:
2494 const UnicodeSetWithStrings &fSet;
2495 int32_t nextStringIndex;
2496 int32_t nextUTF8Start;
2497 };
2498
2499 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2500 // at code point boundaries.
2501 // That is, each edge of a match must not be in the middle of a surrogate pair.
2502 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2503 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2504 s+=start;
2505 limit-=start;
2506 int32_t length=t.length();
2507 return 0==t.compare(s, length) &&
2508 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2509 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2510 }
2511
2512 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2513 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2514 USetSpanCondition spanCondition) {
2515 const UnicodeSet &realSet(set.getSet());
2516 if(!set.hasStrings()) {
2517 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2518 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2519 }
2520
2521 UChar32 c;
2522 int32_t start=0, prev;
2523 while((prev=start)<length) {
2524 U16_NEXT(s, start, length, c);
2525 if(realSet.contains(c)!=spanCondition) {
2526 break;
2527 }
2528 }
2529 return prev;
2530 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2531 UnicodeSetWithStringsIterator iter(set);
2532 UChar32 c;
2533 int32_t start, next;
2534 for(start=next=0; start<length;) {
2535 U16_NEXT(s, next, length, c);
2536 if(realSet.contains(c)) {
2537 break;
2538 }
2539 const UnicodeString *str;
2540 iter.reset();
2541 while((str=iter.nextString())!=NULL) {
2542 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2543 // spanNeedsStrings=TRUE;
2544 return start;
2545 }
2546 }
2547 start=next;
2548 }
2549 return start;
2550 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2551 UnicodeSetWithStringsIterator iter(set);
2552 UChar32 c;
2553 int32_t start, next, maxSpanLimit=0;
2554 for(start=next=0; start<length;) {
2555 U16_NEXT(s, next, length, c);
2556 if(!realSet.contains(c)) {
2557 next=start; // Do not span this single, not-contained code point.
2558 }
2559 const UnicodeString *str;
2560 iter.reset();
2561 while((str=iter.nextString())!=NULL) {
2562 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2563 // spanNeedsStrings=TRUE;
2564 int32_t matchLimit=start+str->length();
2565 if(matchLimit==length) {
2566 return length;
2567 }
2568 if(spanCondition==USET_SPAN_CONTAINED) {
2569 // Iterate for the shortest match at each position.
2570 // Recurse for each but the shortest match.
2571 if(next==start) {
2572 next=matchLimit; // First match from start.
2573 } else {
2574 if(matchLimit<next) {
2575 // Remember shortest match from start for iteration.
2576 int32_t temp=next;
2577 next=matchLimit;
2578 matchLimit=temp;
2579 }
2580 // Recurse for non-shortest match from start.
2581 int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2582 USET_SPAN_CONTAINED);
2583 if((matchLimit+spanLength)>maxSpanLimit) {
2584 maxSpanLimit=matchLimit+spanLength;
2585 if(maxSpanLimit==length) {
2586 return length;
2587 }
2588 }
2589 }
2590 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2591 if(matchLimit>next) {
2592 // Remember longest match from start.
2593 next=matchLimit;
2594 }
2595 }
2596 }
2597 }
2598 if(next==start) {
2599 break; // No match from start.
2600 }
2601 start=next;
2602 }
2603 if(start>maxSpanLimit) {
2604 return start;
2605 } else {
2606 return maxSpanLimit;
2607 }
2608 }
2609 }
2610
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2611 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2612 USetSpanCondition spanCondition) {
2613 if(length==0) {
2614 return 0;
2615 }
2616 const UnicodeSet &realSet(set.getSet());
2617 if(!set.hasStrings()) {
2618 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2619 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2620 }
2621
2622 UChar32 c;
2623 int32_t prev=length;
2624 do {
2625 U16_PREV(s, 0, length, c);
2626 if(realSet.contains(c)!=spanCondition) {
2627 break;
2628 }
2629 } while((prev=length)>0);
2630 return prev;
2631 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2632 UnicodeSetWithStringsIterator iter(set);
2633 UChar32 c;
2634 int32_t prev=length, length0=length;
2635 do {
2636 U16_PREV(s, 0, length, c);
2637 if(realSet.contains(c)) {
2638 break;
2639 }
2640 const UnicodeString *str;
2641 iter.reset();
2642 while((str=iter.nextString())!=NULL) {
2643 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2644 // spanNeedsStrings=TRUE;
2645 return prev;
2646 }
2647 }
2648 } while((prev=length)>0);
2649 return prev;
2650 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2651 UnicodeSetWithStringsIterator iter(set);
2652 UChar32 c;
2653 int32_t prev=length, minSpanStart=length, length0=length;
2654 do {
2655 U16_PREV(s, 0, length, c);
2656 if(!realSet.contains(c)) {
2657 length=prev; // Do not span this single, not-contained code point.
2658 }
2659 const UnicodeString *str;
2660 iter.reset();
2661 while((str=iter.nextString())!=NULL) {
2662 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2663 // spanNeedsStrings=TRUE;
2664 int32_t matchStart=prev-str->length();
2665 if(matchStart==0) {
2666 return 0;
2667 }
2668 if(spanCondition==USET_SPAN_CONTAINED) {
2669 // Iterate for the shortest match at each position.
2670 // Recurse for each but the shortest match.
2671 if(length==prev) {
2672 length=matchStart; // First match from prev.
2673 } else {
2674 if(matchStart>length) {
2675 // Remember shortest match from prev for iteration.
2676 int32_t temp=length;
2677 length=matchStart;
2678 matchStart=temp;
2679 }
2680 // Recurse for non-shortest match from prev.
2681 int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2682 USET_SPAN_CONTAINED);
2683 if(spanStart<minSpanStart) {
2684 minSpanStart=spanStart;
2685 if(minSpanStart==0) {
2686 return 0;
2687 }
2688 }
2689 }
2690 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2691 if(matchStart<length) {
2692 // Remember longest match from prev.
2693 length=matchStart;
2694 }
2695 }
2696 }
2697 }
2698 if(length==prev) {
2699 break; // No match from prev.
2700 }
2701 } while((prev=length)>0);
2702 if(prev<minSpanStart) {
2703 return prev;
2704 } else {
2705 return minSpanStart;
2706 }
2707 }
2708 }
2709
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2710 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2711 USetSpanCondition spanCondition) {
2712 const UnicodeSet &realSet(set.getSet());
2713 if(!set.hasStrings()) {
2714 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2715 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2716 }
2717
2718 UChar32 c;
2719 int32_t start=0, prev;
2720 while((prev=start)<length) {
2721 U8_NEXT_OR_FFFD(s, start, length, c);
2722 if(realSet.contains(c)!=spanCondition) {
2723 break;
2724 }
2725 }
2726 return prev;
2727 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2728 UnicodeSetWithStringsIterator iter(set);
2729 UChar32 c;
2730 int32_t start, next;
2731 for(start=next=0; start<length;) {
2732 U8_NEXT_OR_FFFD(s, next, length, c);
2733 if(realSet.contains(c)) {
2734 break;
2735 }
2736 const char *s8;
2737 int32_t length8;
2738 iter.reset();
2739 while((s8=iter.nextUTF8(length8))!=NULL) {
2740 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2741 // spanNeedsStrings=TRUE;
2742 return start;
2743 }
2744 }
2745 start=next;
2746 }
2747 return start;
2748 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2749 UnicodeSetWithStringsIterator iter(set);
2750 UChar32 c;
2751 int32_t start, next, maxSpanLimit=0;
2752 for(start=next=0; start<length;) {
2753 U8_NEXT_OR_FFFD(s, next, length, c);
2754 if(!realSet.contains(c)) {
2755 next=start; // Do not span this single, not-contained code point.
2756 }
2757 const char *s8;
2758 int32_t length8;
2759 iter.reset();
2760 while((s8=iter.nextUTF8(length8))!=NULL) {
2761 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2762 // spanNeedsStrings=TRUE;
2763 int32_t matchLimit=start+length8;
2764 if(matchLimit==length) {
2765 return length;
2766 }
2767 if(spanCondition==USET_SPAN_CONTAINED) {
2768 // Iterate for the shortest match at each position.
2769 // Recurse for each but the shortest match.
2770 if(next==start) {
2771 next=matchLimit; // First match from start.
2772 } else {
2773 if(matchLimit<next) {
2774 // Remember shortest match from start for iteration.
2775 int32_t temp=next;
2776 next=matchLimit;
2777 matchLimit=temp;
2778 }
2779 // Recurse for non-shortest match from start.
2780 int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2781 USET_SPAN_CONTAINED);
2782 if((matchLimit+spanLength)>maxSpanLimit) {
2783 maxSpanLimit=matchLimit+spanLength;
2784 if(maxSpanLimit==length) {
2785 return length;
2786 }
2787 }
2788 }
2789 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2790 if(matchLimit>next) {
2791 // Remember longest match from start.
2792 next=matchLimit;
2793 }
2794 }
2795 }
2796 }
2797 if(next==start) {
2798 break; // No match from start.
2799 }
2800 start=next;
2801 }
2802 if(start>maxSpanLimit) {
2803 return start;
2804 } else {
2805 return maxSpanLimit;
2806 }
2807 }
2808 }
2809
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2810 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2811 USetSpanCondition spanCondition) {
2812 if(length==0) {
2813 return 0;
2814 }
2815 const UnicodeSet &realSet(set.getSet());
2816 if(!set.hasStrings()) {
2817 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2818 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2819 }
2820
2821 UChar32 c;
2822 int32_t prev=length;
2823 do {
2824 U8_PREV_OR_FFFD(s, 0, length, c);
2825 if(realSet.contains(c)!=spanCondition) {
2826 break;
2827 }
2828 } while((prev=length)>0);
2829 return prev;
2830 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2831 UnicodeSetWithStringsIterator iter(set);
2832 UChar32 c;
2833 int32_t prev=length;
2834 do {
2835 U8_PREV_OR_FFFD(s, 0, length, c);
2836 if(realSet.contains(c)) {
2837 break;
2838 }
2839 const char *s8;
2840 int32_t length8;
2841 iter.reset();
2842 while((s8=iter.nextUTF8(length8))!=NULL) {
2843 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2844 // spanNeedsStrings=TRUE;
2845 return prev;
2846 }
2847 }
2848 } while((prev=length)>0);
2849 return prev;
2850 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2851 UnicodeSetWithStringsIterator iter(set);
2852 UChar32 c;
2853 int32_t prev=length, minSpanStart=length;
2854 do {
2855 U8_PREV_OR_FFFD(s, 0, length, c);
2856 if(!realSet.contains(c)) {
2857 length=prev; // Do not span this single, not-contained code point.
2858 }
2859 const char *s8;
2860 int32_t length8;
2861 iter.reset();
2862 while((s8=iter.nextUTF8(length8))!=NULL) {
2863 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2864 // spanNeedsStrings=TRUE;
2865 int32_t matchStart=prev-length8;
2866 if(matchStart==0) {
2867 return 0;
2868 }
2869 if(spanCondition==USET_SPAN_CONTAINED) {
2870 // Iterate for the shortest match at each position.
2871 // Recurse for each but the shortest match.
2872 if(length==prev) {
2873 length=matchStart; // First match from prev.
2874 } else {
2875 if(matchStart>length) {
2876 // Remember shortest match from prev for iteration.
2877 int32_t temp=length;
2878 length=matchStart;
2879 matchStart=temp;
2880 }
2881 // Recurse for non-shortest match from prev.
2882 int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2883 USET_SPAN_CONTAINED);
2884 if(spanStart<minSpanStart) {
2885 minSpanStart=spanStart;
2886 if(minSpanStart==0) {
2887 return 0;
2888 }
2889 }
2890 }
2891 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2892 if(matchStart<length) {
2893 // Remember longest match from prev.
2894 length=matchStart;
2895 }
2896 }
2897 }
2898 }
2899 if(length==prev) {
2900 break; // No match from prev.
2901 }
2902 } while((prev=length)>0);
2903 if(prev<minSpanStart) {
2904 return prev;
2905 } else {
2906 return minSpanStart;
2907 }
2908 }
2909 }
2910
// spans to be performed and compared
// Bit flags in four groups; the last enumerator of each group is the mask
// covering that group, and SPAN_ALL combines all four masks.
enum {
    // Encoding form of the input text.
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set vs. its complement.
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Span direction.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span condition used for "contained" spans.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2931
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2932 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2933 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2934 }
2935
slen(const void * s,UBool isUTF16)2936 static inline int32_t slen(const void *s, UBool isUTF16) {
2937 return isUTF16 ? u_strlen((const UChar *)s) : static_cast<int32_t>(strlen((const char *)s));
2938 }
2939
/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 *
 * type 0..7 selects the method, as named in typeNames16[]/typeNames8[]:
 * types 0..3 iterate forward, types 4..7 iterate backward;
 * even types alternate USET_SPAN_NOT_CONTAINED with USET_SPAN_CONTAINED,
 * odd types alternate it with USET_SPAN_SIMPLE.
 * whichSpans is tested against SPAN_FWD/SPAN_BACK/SPAN_CONTAINED/SPAN_SIMPLE
 * to decide whether the requested type is enabled.
 * length may be negative for a NUL-terminated string.
 * limits[] receives the span limits in ascending order;
 * limits that do not fit into limitsCapacity are counted but not stored.
 *
 * Sets typeName to an empty string and returns 0 if there is no such type.
 * Returns -1 if the span option is filtered out.
 * Otherwise returns the number of spans, which may be larger than limitsCapacity.
 */
static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
                        const void *s, int32_t length, UBool isUTF16,
                        uint32_t whichSpans,
                        int type, const char *&typeName,
                        int32_t limits[], int32_t limitsCapacity,
                        int32_t expectCount) {
    const UnicodeSet &realSet(set.getSet());
    int32_t start, count;
    USetSpanCondition spanCondition, firstSpanCondition, contained;
    UBool isForward;

    // An out-of-range type tells the caller that all types have been tried.
    if(type<0 || 7<type) {
        typeName="";
        return 0;
    }

    static const char *const typeNames16[]={
        "contains", "contains(LM)",
        "span", "span(LM)",
        "containsBack", "containsBack(LM)",
        "spanBack", "spanBack(LM)"
    };

    static const char *const typeNames8[]={
        "containsUTF8", "containsUTF8(LM)",
        "spanUTF8", "spanUTF8(LM)",
        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
        "spanBackUTF8", "spanBackUTF8(LM)"
    };

    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

    // filter span options
    if(type<=3) {
        // span forward
        if((whichSpans&SPAN_FWD)==0) {
            return -1;
        }
        isForward=TRUE;
    } else {
        // span backward
        if((whichSpans&SPAN_BACK)==0) {
            return -1;
        }
        isForward=FALSE;
    }
    if((type&1)==0) {
        // use USET_SPAN_CONTAINED
        if((whichSpans&SPAN_CONTAINED)==0) {
            return -1;
        }
        contained=USET_SPAN_CONTAINED;
    } else {
        // use USET_SPAN_SIMPLE
        if((whichSpans&SPAN_SIMPLE)==0) {
            return -1;
        }
        contained=USET_SPAN_SIMPLE;
    }

    // Default first span condition for going forward with an uncomplemented set.
    spanCondition=USET_SPAN_NOT_CONTAINED;
    if(isComplement) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    // First span condition for span(), used to terminate the spanBack() iteration.
    firstSpanCondition=spanCondition;

    // spanBack(): Its initial span condition is span()'s last span condition,
    // which is the opposite of span()'s first span condition
    // if we expect an even number of spans.
    // (The loop inverts spanCondition (expectCount-1) times
    // before the expectCount'th span() call.)
    // If we do not compare forward and backward directions, then we do not have an
    // expectCount and just start with firstSpanCondition.
    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    count=0;
    switch(type) {
    case 0:
    case 1:
        // Forward, via the containsSpanUTF16()/containsSpanUTF8() helpers.
        // These are always called with an explicit length.
        start=0;
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
            // Keep counting even when limits[] is full so that the caller can detect the overflow.
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            if(start>=length) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 2:
    case 3:
        // Forward, via UnicodeSet::span()/spanUTF8().
        // A negative length is passed through so that NUL-termination is exercised.
        start=0;
        for(;;) {
            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            // End of string: by length, or at the terminating NUL if length<0.
            if(length>=0 ? start>=length :
                           isUTF16 ? ((const UChar *)s)[start]==0 :
                                     ((const char *)s)[start]==0
            ) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 4:
    case 5:
        // Backward, via the containsSpanBackUTF16()/containsSpanBackUTF8() helpers.
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            ++count;
            // Fill limits[] from its end because the limits are found in descending order.
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]=length;
            }
            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
            // Stop at the start of the string, but only once the condition matches
            // the one that a forward iteration starts with.
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        // Move the limits to the start of limits[] if they did not fill the array.
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    case 6:
    case 7:
        // Backward, via UnicodeSet::spanBack()/spanBackUTF8().
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
            }
            // Note: Length<0 is tested only for the first spanBack().
            // If we wanted to keep length<0 for all spanBack()s, we would have to
            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        // Move the limits to the start of limits[] if they did not fill the array.
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    default:
        // Not reachable after the range check at the top.
        typeName="";
        return -1;
    }

    return count;
}
3117
// sets to be tested; odd index=isComplement
// The enumerators index the sets[] arrays passed to the testSpan...() functions
// and the parallel setNames[] array.
enum {
    SLOW,       // set created from the pattern
    SLOW_NOT,   // complement of SLOW
    FAST,       // clone of a frozen copy of SLOW
    FAST_NOT,   // clone of a frozen copy of SLOW_NOT
    SET_COUNT
};
3126
// Names of the test sets for failure messages, indexed by SLOW..FAST_NOT.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};
3133
3134 /*
3135 * Verify that we get the same results whether we look at text with contains(),
3136 * span() or spanBack(), using unfrozen or frozen versions of the set,
3137 * and using the set or its complement (switching the spanConditions accordingly).
3138 * The latter verifies that
3139 * set.span(spanCondition) == set.complement().span(!spanCondition).
3140 *
3141 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3142 * or returned to the caller (with an input expectCount<0).
3143 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3144 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3145 const void *s, int32_t length, UBool isUTF16,
3146 uint32_t whichSpans,
3147 int32_t expectLimits[], int32_t &expectCount,
3148 const char *testName, int32_t index) {
3149 int32_t limits[500];
3150 int32_t limitsCount;
3151 int i, j;
3152
3153 const char *typeName;
3154 int type;
3155
3156 for(i=0; i<SET_COUNT; ++i) {
3157 if((i&1)==0) {
3158 // Even-numbered sets are original, uncomplemented sets.
3159 if((whichSpans&SPAN_SET)==0) {
3160 continue;
3161 }
3162 } else {
3163 // Odd-numbered sets are complemented.
3164 if((whichSpans&SPAN_COMPLEMENT)==0) {
3165 continue;
3166 }
3167 }
3168 for(type=0;; ++type) {
3169 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3170 s, length, isUTF16,
3171 whichSpans,
3172 type, typeName,
3173 limits, UPRV_LENGTHOF(limits), expectCount);
3174 if(typeName[0]==0) {
3175 break; // All types tried.
3176 }
3177 if(limitsCount<0) {
3178 continue; // Span option filtered out.
3179 }
3180 if(expectCount<0) {
3181 expectCount=limitsCount;
3182 if(limitsCount>UPRV_LENGTHOF(limits)) {
3183 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3184 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3185 return;
3186 }
3187 memcpy(expectLimits, limits, limitsCount*4);
3188 } else if(limitsCount!=expectCount) {
3189 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3190 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3191 } else {
3192 for(j=0; j<limitsCount; ++j) {
3193 if(limits[j]!=expectLimits[j]) {
3194 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3195 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3196 j, (long)limits[j], (long)expectLimits[j]);
3197 break;
3198 }
3199 }
3200 }
3201 }
3202 }
3203
3204 // Compare span() with containsAll()/containsNone(),
3205 // but only if we have expectLimits[] from the uncomplemented set.
3206 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3207 const UChar *s16=(const UChar *)s;
3208 UnicodeString string;
3209 int32_t prev=0, limit, length;
3210 for(i=0; i<expectCount; ++i) {
3211 limit=expectLimits[i];
3212 length=limit-prev;
3213 if(length>0) {
3214 string.setTo(FALSE, s16+prev, length); // read-only alias
3215 if(i&1) {
3216 if(!sets[SLOW]->getSet().containsAll(string)) {
3217 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3218 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3219 return;
3220 }
3221 if(!sets[FAST]->getSet().containsAll(string)) {
3222 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3223 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3224 return;
3225 }
3226 } else {
3227 if(!sets[SLOW]->getSet().containsNone(string)) {
3228 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3229 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3230 return;
3231 }
3232 if(!sets[FAST]->getSet().containsNone(string)) {
3233 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3234 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3235 return;
3236 }
3237 }
3238 }
3239 prev=limit;
3240 }
3241 }
3242 }
3243
3244 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3245 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3246 const void *s, int32_t length, UBool isUTF16,
3247 uint32_t whichSpans,
3248 const char *testName, int32_t index) {
3249 int32_t expectLimits[500];
3250 int32_t expectCount=-1;
3251 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3252 }
3253
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3254 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3255 UChar c, c2;
3256
3257 if(length>=0) {
3258 while(length>0) {
3259 c=*s++;
3260 --length;
3261 if(0xd800<=c && c<0xe000) {
3262 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3263 return TRUE;
3264 }
3265 --length;
3266 }
3267 }
3268 } else {
3269 while((c=*s++)!=0) {
3270 if(0xd800<=c && c<0xe000) {
3271 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3272 return TRUE;
3273 }
3274 }
3275 }
3276 }
3277 return FALSE;
3278 }
3279
3280 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3281 // unless either UTF is turned off in whichSpans.
3282 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3283 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3284 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3285 const UChar *s16, int32_t length16,
3286 uint32_t whichSpans,
3287 const char *testName, int32_t index) {
3288 int32_t expectLimits[500];
3289 int32_t expectCount;
3290
3291 expectCount=-1; // Get expectLimits[] from testSpan().
3292
3293 if((whichSpans&SPAN_UTF16)!=0) {
3294 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3295 }
3296 if((whichSpans&SPAN_UTF8)==0) {
3297 return;
3298 }
3299
3300 // Convert s16[] and expectLimits[] to UTF-8.
3301 uint8_t s8[3000];
3302 int32_t offsets[3000];
3303
3304 const UChar *s16Limit=s16+length16;
3305 char *t=(char *)s8;
3306 char *tLimit=t+sizeof(s8);
3307 int32_t *o=offsets;
3308 UErrorCode errorCode=U_ZERO_ERROR;
3309
3310 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3311 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3312 if(U_FAILURE(errorCode)) {
3313 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3314 testName, (long)index, u_errorName(errorCode));
3315 ucnv_resetFromUnicode(utf8Cnv);
3316 return;
3317 }
3318 int32_t length8=(int32_t)(t-(char *)s8);
3319
3320 // Convert expectLimits[].
3321 int32_t i, j, expect;
3322 for(i=j=0; i<expectCount; ++i) {
3323 expect=expectLimits[i];
3324 if(expect==length16) {
3325 expectLimits[i]=length8;
3326 } else {
3327 while(offsets[j]<expect) {
3328 ++j;
3329 }
3330 expectLimits[i]=j;
3331 }
3332 }
3333
3334 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3335 }
3336
nextCodePoint(UChar32 c)3337 static UChar32 nextCodePoint(UChar32 c) {
3338 // Skip some large and boring ranges.
3339 switch(c) {
3340 case 0x3441:
3341 return 0x4d7f;
3342 case 0x5100:
3343 return 0x9f00;
3344 case 0xb040:
3345 return 0xd780;
3346 case 0xe041:
3347 return 0xf8fe;
3348 case 0x10100:
3349 return 0x20000;
3350 case 0x20041:
3351 return 0xe0000;
3352 case 0xe0101:
3353 return 0x10fffd;
3354 default:
3355 return c+1;
3356 }
3357 }
3358
3359 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3360 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3361 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3362 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3363 // Skip the UTF-8 part of the test - if the string contains surrogates -
3364 // because it is likely to produce a different result.
3365 UBool inconsistentSurrogates=
3366 (!(sets[0]->getSet().contains(0xfffd) ?
3367 sets[0]->getSet().contains(0xd800, 0xdfff) :
3368 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3369 sets[0]->hasStringsWithSurrogates());
3370
3371 UChar s[1000];
3372 int32_t length=0;
3373 uint32_t localWhichSpans;
3374
3375 UChar32 c, first;
3376 for(first=c=0;; c=nextCodePoint(c)) {
3377 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3378 localWhichSpans=whichSpans;
3379 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3380 localWhichSpans&=~SPAN_UTF8;
3381 }
3382 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3383 if(c>0x10ffff) {
3384 break;
3385 }
3386 length=0;
3387 first=c;
3388 }
3389 U16_APPEND_UNSAFE(s, length, c);
3390 }
3391 }
3392
3393 // Test with a particular, interesting string.
3394 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3395 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3396 static const UChar s[]={
3397 0x61, 0x62, 0x20, // Latin, space
3398 0x3b1, 0x3b2, 0x3b3, // Greek
3399 0xd900, // lead surrogate
3400 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3401 0xdc05, // trail surrogate
3402 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3403 0xd900, 0xdc05, // unassigned supplementary
3404 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3405 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3406 0 // NUL
3407 };
3408
3409 if((whichSpans&SPAN_UTF16)==0) {
3410 return;
3411 }
3412 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3413 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3414 }
3415
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3416 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3417 static const char s[]={
3418 "abc" // Latin
3419
3420 /* trail byte in lead position */
3421 "\x80"
3422
3423 " " // space
3424
3425 /* truncated multi-byte sequences */
3426 "\xd0"
3427 "\xe0"
3428 "\xe1"
3429 "\xed"
3430 "\xee"
3431 "\xf0"
3432 "\xf1"
3433 "\xf4"
3434 "\xf8"
3435 "\xfc"
3436
3437 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3438
3439 /* trail byte in lead position */
3440 "\x80"
3441
3442 "\xe0\x80"
3443 "\xe0\xa0"
3444 "\xe1\x80"
3445 "\xed\x80"
3446 "\xed\xa0"
3447 "\xee\x80"
3448 "\xf0\x80"
3449 "\xf0\x90"
3450 "\xf1\x80"
3451 "\xf4\x80"
3452 "\xf4\x90"
3453 "\xf8\x80"
3454 "\xfc\x80"
3455
3456 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3457
3458 /* trail byte in lead position */
3459 "\x80"
3460
3461 "\xf0\x80\x80"
3462 "\xf0\x90\x80"
3463 "\xf1\x80\x80"
3464 "\xf4\x80\x80"
3465 "\xf4\x90\x80"
3466 "\xf8\x80\x80"
3467 "\xfc\x80\x80"
3468
3469 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3470
3471 /* trail byte in lead position */
3472 "\x80"
3473
3474 "\xf8\x80\x80\x80"
3475 "\xfc\x80\x80\x80"
3476
3477 "\xF1\x90\x80\x85" // unassigned supplementary
3478
3479 /* trail byte in lead position */
3480 "\x80"
3481
3482 "\xfc\x80\x80\x80\x80"
3483
3484 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3485
3486 /* trail byte in lead position */
3487 "\x80"
3488
3489 /* complete sequences but non-shortest forms or out of range etc. */
3490 "\xc0\x80"
3491 "\xe0\x80\x80"
3492 "\xed\xa0\x80"
3493 "\xf0\x80\x80\x80"
3494 "\xf4\x90\x80\x80"
3495 "\xf8\x80\x80\x80\x80"
3496 "\xfc\x80\x80\x80\x80\x80"
3497 "\xfe"
3498 "\xff"
3499
3500 /* trail byte in lead position */
3501 "\x80"
3502
3503 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3504 };
3505
3506 if((whichSpans&SPAN_UTF8)==0) {
3507 return;
3508 }
3509 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3510 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3511 }
3512
// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
// The portions for b and c are written after the first whichSpansCount entries,
// so whichSpans[] must have room for the returned number of entries.
// Returns the new number of entries.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // c is only used together with b.
    const bool useB= b!=0;
    const bool useC= useB && c!=0;

    for(int32_t entry=0; entry<whichSpansCount; ++entry) {
        const uint32_t kept=whichSpans[entry]&mask;
        whichSpans[entry]=kept|a;
        if(useB) {
            whichSpans[whichSpansCount+entry]=kept|b;
        }
        if(useC) {
            whichSpans[2*whichSpansCount+entry]=kept|c;
        }
    }

    if(!useB) {
        return whichSpansCount;
    }
    return useC ? 3*whichSpansCount : 2*whichSpansCount;
}
3535
// String literal pieces with runs of 'a' or 'b' (63 or 64 of them, as the names say).
// TestSpan() concatenates them into set strings and test strings
// with an initial or final code point span longer than 254.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3540
TestSpan()3541 void UnicodeSetTest::TestSpan() {
3542 // "[...]" is a UnicodeSet pattern.
3543 // "*" performs tests on all Unicode code points and on a selection of
3544 // malformed UTF-8/16 strings.
3545 // "-options" limits the scope of testing for the current set.
3546 // By default, the test verifies that equivalent boundaries are found
3547 // for UTF-16 and UTF-8, going forward and backward,
3548 // alternating USET_SPAN_NOT_CONTAINED with
3549 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3550 // Single-character options:
3551 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3552 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3553 // or the set contains strings with unpaired surrogates
3554 // which do not translate to valid UTF-8.
3555 // c -- set.span() and set.complement().span() boundaries may differ.
3556 // Cause: Set strings are not complemented.
3557 // b -- span() and spanBack() boundaries may differ.
3558 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3559 // and spanBack(USET_SPAN_SIMPLE) are defined to
3560 // match with non-overlapping substrings.
3561 // For example, with a set containing "ab" and "ba",
3562 // span() of "aba" yields boundaries { 0, 2, 3 }
3563 // because the initial "ab" matches from 0 to 2,
3564 // while spanBack() yields boundaries { 0, 1, 3 }
3565 // because the final "ba" matches from 1 to 3.
3566 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3567 // Cause: Strings in the set overlap, and a longer match may
3568 // require a sequence including non-longest substrings.
3569 // For example, with a set containing "ab", "abc" and "cd",
3570 // span(contained) of "abcd" spans the entire string
3571 // but span(longest match) only spans the first 3 characters.
3572 // Each "-options" first resets all options and then applies the specified options.
3573 // A "-" without options resets the options.
3574 // The options are also reset for each new set.
3575 // Other strings will be spanned.
3576 static const char *const testdata[]={
3577 "[:ID_Continue:]",
3578 "*",
3579 "[:White_Space:]",
3580 "*",
3581 "[]",
3582 "*",
3583 "[\\u0000-\\U0010FFFF]",
3584 "*",
3585 "[\\u0000\\u0080\\u0800\\U00010000]",
3586 "*",
3587 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3588 "*",
3589 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3590 "-c",
3591 "*",
3592 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3593 "-c",
3594 "*",
3595
3596 // Overlapping strings cause overlapping attempts to match.
3597 "[x{xy}{xya}{axy}{ax}]",
3598 "-cl",
3599
3600 // More repetitions of "xya" would take too long with the recursive
3601 // reference implementation.
3602 // containsAll()=FALSE
3603 // test_string 0x14
3604 "xx"
3605 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3606 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3607 "xyaxyaxyaxya"
3608 "xx"
3609 "xyaxyaxyaxya" // span() ends here.
3610 "aaa",
3611
3612 // containsAll()=TRUE
3613 // test_string 0x15
3614 "xx"
3615 "xyaxyaxyaxya"
3616 "xx"
3617 "xyaxyaxyaxya"
3618 "xx"
3619 "xyaxyaxyaxy",
3620
3621 "-bc",
3622 // test_string 0x17
3623 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3624 "-c",
3625 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3626 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3627 "-",
3628 "byaya", // span() -> { 5 }
3629 "byay", // span() -> { 4 }
3630 "bya", // span() -> { 3 }
3631
3632 // span(longest match) will not span the whole string.
3633 "[a{ab}{bc}]",
3634 "-cl",
3635 // test_string 0x21
3636 "abc",
3637
3638 "[a{ab}{abc}{cd}]",
3639 "-cl",
3640 "acdabcdabccd",
3641
3642 // spanBack(longest match) will not span the whole string.
3643 "[c{ab}{bc}]",
3644 "-cl",
3645 "abc",
3646
3647 "[d{cd}{bcd}{ab}]",
3648 "-cl",
3649 "abbcdabcdabd",
3650
3651 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3652 // and UTF-8 trail bytes.
3653 // Copies of above test sets and strings, but transliterated to have
3654 // different code points with similar trail units.
3655 // Previous: a b c d
3656 // Unicode: 042B 30AB 200AB 204AB
3657 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3658 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3659 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3660 "-cl",
3661 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3662
3663 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3664 "-cl",
3665 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3666
3667 // Stress bookkeeping and recursion.
3668 // The following strings are barely doable with the recursive
3669 // reference implementation.
3670 // The not-contained character at the end prevents an early exit from the span().
3671 "[b{bb}]",
3672 "-c",
3673 // test_string 0x33
3674 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3675 // On complement sets, span() and spanBack() get different results
3676 // because b is not in the complement set and there is an odd number of b's
3677 // in the test string.
3678 "-bc",
3679 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3680
3681 // Test with set strings with an initial or final code point span
3682 // longer than 254.
3683 "[a{" _64_a _64_a _64_a _64_a "b}"
3684 "{a" _64_b _64_b _64_b _64_b "}]",
3685 "-c",
3686 _64_a _64_a _64_a _63_a "b",
3687 _64_a _64_a _64_a _64_a "b",
3688 _64_a _64_a _64_a _64_a "aaaabbbb",
3689 "a" _64_b _64_b _64_b _63_b,
3690 "a" _64_b _64_b _64_b _64_b,
3691 "aaaabbbb" _64_b _64_b _64_b _64_b,
3692
3693 // Test with strings containing unpaired surrogates.
3694 // They are not representable in UTF-8, and a leading trail surrogate
3695 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3696 // U+20001 == \\uD840\\uDC01
3697 // U+20400 == \\uD841\\uDC00
3698 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3699 "-8cl",
3700 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3701 };
3702 uint32_t whichSpans[96]={ SPAN_ALL };
3703 int32_t whichSpansCount=1;
3704
3705 UnicodeSet *sets[SET_COUNT]={ NULL };
3706 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3707
3708 char testName[1024];
3709 char *testNameLimit=testName;
3710
3711 int32_t i, j;
3712 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3713 const char *s=testdata[i];
3714 if(s[0]=='[') {
3715 // Create new test sets from this pattern.
3716 for(j=0; j<SET_COUNT; ++j) {
3717 delete sets_with_str[j];
3718 delete sets[j];
3719 }
3720 UErrorCode errorCode=U_ZERO_ERROR;
3721 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3722 if(U_FAILURE(errorCode)) {
3723 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3724 break;
3725 }
3726 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3727 sets[SLOW_NOT]->complement();
3728 // Intermediate set: Test cloning of a frozen set.
3729 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3730 fast->freeze();
3731 sets[FAST]=fast->clone();
3732 delete fast;
3733 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3734 fastNot->freeze();
3735 sets[FAST_NOT]=fastNot->clone();
3736 delete fastNot;
3737
3738 for(j=0; j<SET_COUNT; ++j) {
3739 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3740 }
3741
3742 strcpy(testName, s);
3743 testNameLimit=strchr(testName, 0);
3744 *testNameLimit++=':';
3745 *testNameLimit=0;
3746
3747 whichSpans[0]=SPAN_ALL;
3748 whichSpansCount=1;
3749 } else if(s[0]=='-') {
3750 whichSpans[0]=SPAN_ALL;
3751 whichSpansCount=1;
3752
3753 while(*++s!=0) {
3754 switch(*s) {
3755 case 'c':
3756 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3757 ~SPAN_POLARITY,
3758 SPAN_SET,
3759 SPAN_COMPLEMENT,
3760 0);
3761 break;
3762 case 'b':
3763 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3764 ~SPAN_DIRS,
3765 SPAN_FWD,
3766 SPAN_BACK,
3767 0);
3768 break;
3769 case 'l':
3770 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3771 // USET_SPAN_SIMPLE only FWD, and separately
3772 // USET_SPAN_SIMPLE only BACK
3773 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3774 ~(SPAN_DIRS|SPAN_CONDITION),
3775 SPAN_DIRS|SPAN_CONTAINED,
3776 SPAN_FWD|SPAN_SIMPLE,
3777 SPAN_BACK|SPAN_SIMPLE);
3778 break;
3779 case '8':
3780 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3781 ~SPAN_UTFS,
3782 SPAN_UTF16,
3783 SPAN_UTF8,
3784 0);
3785 break;
3786 default:
3787 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3788 break;
3789 }
3790 }
3791 } else if(0==strcmp(s, "*")) {
3792 strcpy(testNameLimit, "bad_string");
3793 for(j=0; j<whichSpansCount; ++j) {
3794 if(whichSpansCount>1) {
3795 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3796 "%%0x%3x",
3797 whichSpans[j]);
3798 }
3799 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3800 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3801 }
3802
3803 strcpy(testNameLimit, "contents");
3804 for(j=0; j<whichSpansCount; ++j) {
3805 if(whichSpansCount>1) {
3806 sprintf(testNameLimit+8 /* strlen("contents") */,
3807 "%%0x%3x",
3808 whichSpans[j]);
3809 }
3810 testSpanContents(sets_with_str, whichSpans[j], testName);
3811 }
3812 } else {
3813 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3814 strcpy(testNameLimit, "test_string");
3815 for(j=0; j<whichSpansCount; ++j) {
3816 if(whichSpansCount>1) {
3817 sprintf(testNameLimit+11 /* strlen("test_string") */,
3818 "%%0x%3x",
3819 whichSpans[j]);
3820 }
3821 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3822 }
3823 }
3824 }
3825 for(j=0; j<SET_COUNT; ++j) {
3826 delete sets_with_str[j];
3827 delete sets[j];
3828 }
3829 }
3830
3831 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3832 void UnicodeSetTest::TestStringSpan() {
3833 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3834 static const char *const string=
3835 "xx"
3836 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3837 "xx"
3838 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3839 "xx"
3840 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3841 "aaaa";
3842
3843 UErrorCode errorCode=U_ZERO_ERROR;
3844 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3845 UnicodeSet set(pattern16, errorCode);
3846 if(U_FAILURE(errorCode)) {
3847 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3848 return;
3849 }
3850
3851 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3852
3853 if(set.containsAll(string16)) {
3854 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3855 }
3856
3857 // Remove trailing "aaaa".
3858 string16.truncate(string16.length()-4);
3859 if(!set.containsAll(string16)) {
3860 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3861 }
3862
3863 string16=u"byayaxya";
3864 const UChar *s16=string16.getBuffer();
3865 int32_t length16=string16.length();
3866 (void)length16; // Suppress set but not used warning.
3867 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3868 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3869 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3870 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3871 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3872 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3873 ) {
3874 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3875 }
3876
3877 pattern="[a{ab}{abc}{cd}]";
3878 pattern16=UnicodeString(pattern, -1, US_INV);
3879 set.applyPattern(pattern16, errorCode);
3880 if(U_FAILURE(errorCode)) {
3881 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3882 return;
3883 }
3884 string16=u"acdabcdabccd";
3885 s16=string16.getBuffer();
3886 length16=string16.length();
3887 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3888 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3889 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3890 ) {
3891 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3892 }
3893
3894 pattern="[d{cd}{bcd}{ab}]";
3895 pattern16=UnicodeString(pattern, -1, US_INV);
3896 set.applyPattern(pattern16, errorCode).freeze();
3897 if(U_FAILURE(errorCode)) {
3898 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3899 return;
3900 }
3901 string16=u"abbcdabcdabd";
3902 s16=string16.getBuffer();
3903 length16=string16.length();
3904 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3905 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3906 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3907 ) {
3908 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3909 }
3910 }
3911
TestPatternWithSurrogates()3912 void UnicodeSetTest::TestPatternWithSurrogates() {
3913 IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
3914 // Regression test for ICU-11891
3915 UnicodeSet surrogates;
3916 surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
3917 surrogates.add(0xd83a); // a lead surrogate
3918 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3919 UnicodeString pat;
3920 surrogates.toPattern(pat, false); // bad if U+D83A is immediately followed by U+DC00
3921 UnicodeSet s2;
3922 // was: U_MALFORMED_SET
3923 // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
3924 s2.applyPattern(pat, errorCode);
3925 if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
3926 checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
3927
3928 // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
3929 surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
3930 // add a beyond-surrogates range, up to the last code point
3931 surrogates.add(0x10affe, 0x10ffff);
3932 surrogates.toPattern(pat, false); // bad if U+DBFF is immediately followed by U+DC00
3933 s2.applyPattern(pat, errorCode);
3934 if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
3935 checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
3936
3937 // Test the toPattern() code path when the pattern is shorter in complement form:
3938 // [^opposite-ranges]
3939 surrogates.add(0, 0x6789);
3940 surrogates.toPattern(pat, false);
3941 s2.applyPattern(pat, errorCode);
3942 if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
3943 checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
3944
3945 // Start with a pattern, in case the original pattern is kept but
3946 // without the extra white space.
3947 surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
3948 if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
3949 surrogates.toPattern(pat, false);
3950 s2.applyPattern(pat, errorCode);
3951 if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
3952 checkEqual(surrogates, s2, "surrogates from/to/from pattern");
3953 }
3954
TestIntOverflow()3955 void UnicodeSetTest::TestIntOverflow() {
3956 // This test triggers undefined double->int conversion behavior
3957 // if the implementation is not careful.
3958 IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3959 UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3960 assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3961 assertEquals("[:ccc=int_overflow:] -> illegal argument",
3962 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3963 }
3964
TestUnusedCcc()3965 void UnicodeSetTest::TestUnusedCcc() {
3966 #if !UCONFIG_NO_NORMALIZATION
3967 // All numeric ccc values 0..255 are valid, but many are unused.
3968 IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3969 UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3970 assertSuccess("[:ccc=2:]", errorCode);
3971 assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3972
3973 UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3974 assertSuccess("[:ccc=255:]", errorCode);
3975 assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3976
3977 // Non-integer values and values outside 0..255 are invalid.
3978 UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3979 assertEquals("[:ccc=-1:] -> illegal argument",
3980 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3981 assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3982
3983 UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3984 assertEquals("[:ccc=256:] -> illegal argument",
3985 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3986 assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3987
3988 UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3989 assertEquals("[:ccc=1.1:] -> illegal argument",
3990 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3991 assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
3992 #endif
3993 }
3994
TestDeepPattern()3995 void UnicodeSetTest::TestDeepPattern() {
3996 IcuTestErrorCode errorCode(*this, "TestDeepPattern");
3997 // Nested ranges are parsed via recursion which can use a lot of stack space.
3998 // After a reasonable limit, we should get an error.
3999 constexpr int32_t DEPTH = 20000;
4000 UnicodeString pattern, suffix;
4001 for (int32_t i = 0; i < DEPTH; ++i) {
4002 pattern.append(u"[a", 2);
4003 suffix.append(']');
4004 }
4005 pattern.append(suffix);
4006 UnicodeSet set(pattern, errorCode);
4007 assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
4008 errorCode.reset();
4009 }
4010
TestEmptyString()4011 void UnicodeSetTest::TestEmptyString() {
4012 IcuTestErrorCode errorCode(*this, "TestEmptyString");
4013 // Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
4014 UnicodeSet set(u"[{}]", errorCode);
4015 if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
4016 assertTrue("set from pattern with {}", set.contains(u""));
4017 assertEquals("set from pattern with {}: size", 1, set.size());
4018 assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
4019
4020 // Remove, add back, ...
4021 assertFalse("remove empty string", set.remove(u"").contains(u""));
4022 assertEquals("remove empty string: size", 0, set.size());
4023 assertTrue("remove empty string: isEmpty", set.isEmpty());
4024 assertTrue("add empty string", set.add(u"").contains(u""));
4025 // missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
4026 assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
4027 assertTrue("complement-add empty string", set.complement(u"").contains(u""));
4028
4029 assertFalse("clear", set.clear().contains(u""));
4030 assertTrue("add empty string 2", set.add(u"").contains(u""));
4031 assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
4032 assertTrue("add empty string 3", set.add(u"").contains(u""));
4033 // Note that this leaves the set containing exactly the empty string.
4034
4035 // strings() access and iteration
4036 // no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
4037 UnicodeSetIterator sit(set);
4038 assertTrue("set iterator.next()", sit.next());
4039 assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
4040
4041 // The empty string is ignored in matching.
4042 set.add(u'a').add(u'c');
4043 assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4044 assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4045 assertTrue("containsNone", set.containsNone(u"def"));
4046 assertFalse("containsSome", set.containsSome(u"def"));
4047 set.freeze();
4048 assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4049 assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4050 assertTrue("frozen containsNone", set.containsNone(u"def"));
4051 assertFalse("frozen containsSome", set.containsSome(u"def"));
4052 }
4053
assertNext(UnicodeSetIterator & iter,const UnicodeString & expected)4054 void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
4055 assertTrue(expected + ".next()", iter.next());
4056 assertEquals(expected + ".getString()", expected, iter.getString());
4057 }
4058
TestSkipToStrings()4059 void UnicodeSetTest::TestSkipToStrings() {
4060 IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
4061 UnicodeSet set(u"[0189{}{ch}]", errorCode);
4062 UnicodeSetIterator iter(set);
4063 assertNext(iter.skipToStrings(), u"");
4064 assertNext(iter, u"ch");
4065 assertFalse("no next", iter.next());
4066
4067 iter.reset();
4068 assertNext(iter, u"0");
4069 assertNext(iter, u"1");
4070 assertNext(iter, u"8");
4071 assertNext(iter, u"9");
4072 assertNext(iter, u"");
4073 assertNext(iter, u"ch");
4074 assertFalse("no next", iter.next());
4075
4076 iter.reset();
4077 assertNext(iter, u"0");
4078 iter.skipToStrings();
4079 assertNext(iter, u"");
4080 assertNext(iter, u"ch");
4081 assertFalse("no next", iter.next());
4082
4083 iter.reset();
4084 iter.nextRange();
4085 assertNext(iter, u"8");
4086 iter.skipToStrings();
4087 assertNext(iter, u"");
4088 assertNext(iter, u"ch");
4089 assertFalse("no next", iter.next());
4090
4091 iter.reset();
4092 iter.nextRange();
4093 iter.nextRange();
4094 iter.nextRange();
4095 iter.skipToStrings();
4096 assertNext(iter, u"ch");
4097 assertFalse("no next", iter.next());
4098 }
4099
TestPatternCodePointComplement()4100 void UnicodeSetTest::TestPatternCodePointComplement() {
4101 IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
4102 // ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
4103 // [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
4104 {
4105 UnicodeSet simple(u"[^abc{ch}]", errorCode);
4106 assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
4107 assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
4108 assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
4109 }
4110
4111 {
4112 UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
4113 if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
4114 return;
4115 }
4116 assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
4117 assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
4118 assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U''));
4119 }
4120
4121 {
4122 UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
4123 assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
4124 assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
4125 assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U''));
4126 }
4127
4128 {
4129 UnicodeSet notBasic;
4130 notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
4131 assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
4132 notBasic.size() > 1000);
4133 assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
4134 notBasic.hasStrings());
4135 assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
4136 notBasic.contains(U''));
4137 }
4138
4139 {
4140 UnicodeSet notBasic;
4141 notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
4142 assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
4143 notBasic.size() > 1000);
4144 assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
4145 notBasic.hasStrings());
4146 assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
4147 notBasic.contains(U''));
4148 }
4149
4150 // When there are strings, we must not use the complement for a more compact toPattern().
4151 {
4152 UnicodeSet set;
4153 set.add(0, u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
4154 UnicodeString pattern;
4155 set.toPattern(pattern, true);
4156 UnicodeSet set2(pattern, errorCode);
4157 checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
4158 assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
4159
4160 set.add("ch").add("ss");
4161 set.toPattern(pattern, true);
4162 set2 = UnicodeSet(pattern, errorCode);
4163 checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
4164 assertEquals("set(with 0 & max, with strings).toPattern()",
4165 u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
4166 }
4167
4168 // The complement() API behavior does not change under this ticket.
4169 {
4170 UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
4171 notBasic.complement();
4172 assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
4173 assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
4174 assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
4175 notBasic.contains(u"\uFE0F"));
4176 assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U''));
4177 }
4178 }
4179