1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "ruleiter.h"
21 #include "cmemory.h"
22 #include "cstring.h"
23 #include "patternprops.h"
24 #include "uelement.h"
25 #include "util.h"
26 #include "uvector.h"
27 #include "charstr.h"
28 #include "ustrfmt.h"
29 #include "uassert.h"
30 #include "bmpset.h"
31 #include "unisetspan.h"
32
33 // HIGH_VALUE > all valid values. 110000 for codepoints
34 #define UNICODESET_HIGH 0x0110000
35
36 // LOW <= all valid values. ZERO for codepoints
37 #define UNICODESET_LOW 0x000000
38
39 /** Max list [0, 1, 2, ..., max code point, HIGH] */
40 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
41
42 U_NAMESPACE_BEGIN
43
~SymbolTable()44 SymbolTable::~SymbolTable() {}
45
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
47
48 /**
49 * Modify the given UChar32 variable so that it is in range, by
50 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52 * It modifies its argument in-place and also returns it.
53 */
54 static inline UChar32 pinCodePoint(UChar32& c) {
55 if (c < UNICODESET_LOW) {
56 c = UNICODESET_LOW;
57 } else if (c > (UNICODESET_HIGH-1)) {
58 c = (UNICODESET_HIGH-1);
59 }
60 return c;
61 }
62
63 //----------------------------------------------------------------
64 // Debugging
65 //----------------------------------------------------------------
66
67 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
68 // To enable the debugging, define the symbol DEBUG_MEM in the line
69 // below. This will result in text being sent to stdout that looks
70 // like this:
71 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73 // Each line lists a construction (ct) or destruction (dt) event, the
74 // object address, the number of outstanding objects after the event,
75 // and the pattern of the object in question.
76
77 // #define DEBUG_MEM
78
79 #ifdef DEBUG_MEM
80 #include <stdio.h>
81 static int32_t _dbgCount = 0;
82
_dbgct(UnicodeSet * set)83 static inline void _dbgct(UnicodeSet* set) {
84 UnicodeString str;
85 set->toPattern(str, TRUE);
86 char buf[40];
87 str.extract(0, 39, buf, "");
88 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89 }
90
_dbgdt(UnicodeSet * set)91 static inline void _dbgdt(UnicodeSet* set) {
92 UnicodeString str;
93 set->toPattern(str, TRUE);
94 char buf[40];
95 str.extract(0, 39, buf, "");
96 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97 }
98
99 #else
100
101 #define _dbgct(set)
102 #define _dbgdt(set)
103
104 #endif
105
106 //----------------------------------------------------------------
107 // UnicodeString in UVector support
108 //----------------------------------------------------------------
109
cloneUnicodeString(UElement * dst,UElement * src)110 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
111 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
112 }
113
compareUnicodeString(UElement t1,UElement t2)114 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
116 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
117 return a.compare(b);
118 }
119
hasStrings() const120 UBool UnicodeSet::hasStrings() const {
121 return strings != nullptr && !strings->isEmpty();
122 }
123
stringsSize() const124 int32_t UnicodeSet::stringsSize() const {
125 return strings == nullptr ? 0 : strings->size();
126 }
127
stringsContains(const UnicodeString & s) const128 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129 return strings != nullptr && strings->contains((void*) &s);
130 }
131
132 //----------------------------------------------------------------
133 // Constructors &c
134 //----------------------------------------------------------------
135
136 /**
137 * Constructs an empty set.
138 */
UnicodeSet()139 UnicodeSet::UnicodeSet() {
140 list[0] = UNICODESET_HIGH;
141 _dbgct(this);
142 }
143
144 /**
145 * Constructs a set containing the given range. If <code>end >
146 * start</code> then an empty set is created.
147 *
148 * @param start first character, inclusive, of range
149 * @param end last character, inclusive, of range
150 */
UnicodeSet(UChar32 start,UChar32 end)151 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152 list[0] = UNICODESET_HIGH;
153 add(start, end);
154 _dbgct(this);
155 }
156
157 /**
158 * Constructs a set that is identical to the given UnicodeSet.
159 */
UnicodeSet(const UnicodeSet & o)160 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
161 *this = o;
162 _dbgct(this);
163 }
164
165 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)166 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
167 if (ensureCapacity(o.len)) {
168 // *this = o except for bmpSet and stringSpan
169 len = o.len;
170 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
171 if (o.hasStrings()) {
172 UErrorCode status = U_ZERO_ERROR;
173 if (!allocateStrings(status) ||
174 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
175 setToBogus();
176 return;
177 }
178 }
179 if (o.pat) {
180 setPattern(o.pat, o.patLen);
181 }
182 _dbgct(this);
183 }
184 }
185
186 /**
187 * Destructs the set.
188 */
~UnicodeSet()189 UnicodeSet::~UnicodeSet() {
190 _dbgdt(this); // first!
191 if (list != stackList) {
192 uprv_free(list);
193 }
194 delete bmpSet;
195 if (buffer != stackList) {
196 uprv_free(buffer);
197 }
198 delete strings;
199 delete stringSpan;
200 releasePattern();
201 }
202
203 /**
204 * Assigns this object to be a copy of another.
205 */
operator =(const UnicodeSet & o)206 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207 return copyFrom(o, FALSE);
208 }
209
copyFrom(const UnicodeSet & o,UBool asThawed)210 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211 if (this == &o) {
212 return *this;
213 }
214 if (isFrozen()) {
215 return *this;
216 }
217 if (o.isBogus()) {
218 setToBogus();
219 return *this;
220 }
221 if (!ensureCapacity(o.len)) {
222 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223 return *this;
224 }
225 len = o.len;
226 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
227 if (o.bmpSet != nullptr && !asThawed) {
228 bmpSet = new BMPSet(*o.bmpSet, list, len);
229 if (bmpSet == NULL) { // Check for memory allocation error.
230 setToBogus();
231 return *this;
232 }
233 }
234 if (o.hasStrings()) {
235 UErrorCode status = U_ZERO_ERROR;
236 if ((strings == nullptr && !allocateStrings(status)) ||
237 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
238 setToBogus();
239 return *this;
240 }
241 } else if (hasStrings()) {
242 strings->removeAllElements();
243 }
244 if (o.stringSpan != nullptr && !asThawed) {
245 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
246 if (stringSpan == NULL) { // Check for memory allocation error.
247 setToBogus();
248 return *this;
249 }
250 }
251 releasePattern();
252 if (o.pat) {
253 setPattern(o.pat, o.patLen);
254 }
255 return *this;
256 }
257
258 /**
259 * Returns a copy of this object. All UnicodeMatcher objects have
260 * to support cloning in order to allow classes using
261 * UnicodeMatchers, such as Transliterator, to implement cloning.
262 */
clone() const263 UnicodeSet* UnicodeSet::clone() const {
264 return new UnicodeSet(*this);
265 }
266
cloneAsThawed() const267 UnicodeSet *UnicodeSet::cloneAsThawed() const {
268 return new UnicodeSet(*this, TRUE);
269 }
270
271 /**
272 * Compares the specified object with this set for equality. Returns
273 * <tt>true</tt> if the two sets
274 * have the same size, and every member of the specified set is
275 * contained in this set (or equivalently, every member of this set is
276 * contained in the specified set).
277 *
278 * @param o set to be compared for equality with this set.
279 * @return <tt>true</tt> if the specified set is equal to this set.
280 */
operator ==(const UnicodeSet & o) const281 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
282 if (len != o.len) return FALSE;
283 for (int32_t i = 0; i < len; ++i) {
284 if (list[i] != o.list[i]) return FALSE;
285 }
286 if (hasStrings() != o.hasStrings()) { return FALSE; }
287 if (hasStrings() && *strings != *o.strings) return FALSE;
288 return TRUE;
289 }
290
291 /**
292 * Returns the hash code value for this set.
293 *
294 * @return the hash code value for this set.
295 * @see Object#hashCode()
296 */
hashCode(void) const297 int32_t UnicodeSet::hashCode(void) const {
298 uint32_t result = static_cast<uint32_t>(len);
299 for (int32_t i = 0; i < len; ++i) {
300 result *= 1000003u;
301 result += list[i];
302 }
303 return static_cast<int32_t>(result);
304 }
305
306 //----------------------------------------------------------------
307 // Public API
308 //----------------------------------------------------------------
309
310 /**
311 * Returns the number of elements in this set (its cardinality),
312 * Note than the elements of a set may include both individual
313 * codepoints and strings.
314 *
315 * @return the number of elements in this set (its cardinality).
316 */
size(void) const317 int32_t UnicodeSet::size(void) const {
318 int32_t n = 0;
319 int32_t count = getRangeCount();
320 for (int32_t i = 0; i < count; ++i) {
321 n += getRangeEnd(i) - getRangeStart(i) + 1;
322 }
323 return n + stringsSize();
324 }
325
326 /**
327 * Returns <tt>true</tt> if this set contains no elements.
328 *
329 * @return <tt>true</tt> if this set contains no elements.
330 */
isEmpty(void) const331 UBool UnicodeSet::isEmpty(void) const {
332 return len == 1 && !hasStrings();
333 }
334
335 /**
336 * Returns true if this set contains the given character.
337 * @param c character to be checked for containment
338 * @return true if the test condition is met
339 */
contains(UChar32 c) const340 UBool UnicodeSet::contains(UChar32 c) const {
341 // Set i to the index of the start item greater than ch
342 // We know we will terminate without length test!
343 // LATER: for large sets, add binary search
344 //int32_t i = -1;
345 //for (;;) {
346 // if (c < list[++i]) break;
347 //}
348 if (bmpSet != NULL) {
349 return bmpSet->contains(c);
350 }
351 if (stringSpan != NULL) {
352 return stringSpan->contains(c);
353 }
354 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
355 return FALSE;
356 }
357 int32_t i = findCodePoint(c);
358 return (UBool)(i & 1); // return true if odd
359 }
360
361 /**
362 * Returns the smallest value i such that c < list[i]. Caller
363 * must ensure that c is a legal value or this method will enter
364 * an infinite loop. This method performs a binary search.
365 * @param c a character in the range MIN_VALUE..MAX_VALUE
366 * inclusive
367 * @return the smallest integer i in the range 0..len-1,
368 * inclusive, such that c < list[i]
369 */
findCodePoint(UChar32 c) const370 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371 /* Examples:
372 findCodePoint(c)
373 set list[] c=0 1 3 4 7 8
374 === ============== ===========
375 [] [110000] 0 0 0 0 0 0
376 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
377 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
378 [:Any:] [0, 110000] 1 1 1 1 1 1
379 */
380
381 // Return the smallest i such that c < list[i]. Assume
382 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383 if (c < list[0])
384 return 0;
385 // High runner test. c is often after the last range, so an
386 // initial check for this condition pays off.
387 int32_t lo = 0;
388 int32_t hi = len - 1;
389 if (lo >= hi || c >= list[hi-1])
390 return hi;
391 // invariant: c >= list[lo]
392 // invariant: c < list[hi]
393 for (;;) {
394 int32_t i = (lo + hi) >> 1;
395 if (i == lo) {
396 break; // Found!
397 } else if (c < list[i]) {
398 hi = i;
399 } else {
400 lo = i;
401 }
402 }
403 return hi;
404 }
405
406 /**
407 * Returns true if this set contains every character
408 * of the given range.
409 * @param start first character, inclusive, of the range
410 * @param end last character, inclusive, of the range
411 * @return true if the test condition is met
412 */
contains(UChar32 start,UChar32 end) const413 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414 //int32_t i = -1;
415 //for (;;) {
416 // if (start < list[++i]) break;
417 //}
418 int32_t i = findCodePoint(start);
419 return ((i & 1) != 0 && end < list[i]);
420 }
421
422 /**
423 * Returns <tt>true</tt> if this set contains the given
424 * multicharacter string.
425 * @param s string to be checked for containment
426 * @return <tt>true</tt> if this set contains the specified string
427 */
contains(const UnicodeString & s) const428 UBool UnicodeSet::contains(const UnicodeString& s) const {
429 int32_t cp = getSingleCP(s);
430 if (cp < 0) {
431 return stringsContains(s);
432 } else {
433 return contains((UChar32) cp);
434 }
435 }
436
437 /**
438 * Returns true if this set contains all the characters and strings
439 * of the given set.
440 * @param c set to be checked for containment
441 * @return true if the test condition is met
442 */
containsAll(const UnicodeSet & c) const443 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444 // The specified set is a subset if all of its pairs are contained in
445 // this set. It's possible to code this more efficiently in terms of
446 // direct manipulation of the inversion lists if the need arises.
447 int32_t n = c.getRangeCount();
448 for (int i=0; i<n; ++i) {
449 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450 return FALSE;
451 }
452 }
453 return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
454 }
455
456 /**
457 * Returns true if this set contains all the characters
458 * of the given string.
459 * @param s string containing characters to be checked for containment
460 * @return true if the test condition is met
461 */
containsAll(const UnicodeString & s) const462 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
464 s.length());
465 }
466
467 /**
468 * Returns true if this set contains none of the characters
469 * of the given range.
470 * @param start first character, inclusive, of the range
471 * @param end last character, inclusive, of the range
472 * @return true if the test condition is met
473 */
containsNone(UChar32 start,UChar32 end) const474 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
475 //int32_t i = -1;
476 //for (;;) {
477 // if (start < list[++i]) break;
478 //}
479 int32_t i = findCodePoint(start);
480 return ((i & 1) == 0 && end < list[i]);
481 }
482
483 /**
484 * Returns true if this set contains none of the characters and strings
485 * of the given set.
486 * @param c set to be checked for containment
487 * @return true if the test condition is met
488 */
containsNone(const UnicodeSet & c) const489 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
490 // The specified set is a subset if all of its pairs are contained in
491 // this set. It's possible to code this more efficiently in terms of
492 // direct manipulation of the inversion lists if the need arises.
493 int32_t n = c.getRangeCount();
494 for (int32_t i=0; i<n; ++i) {
495 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
496 return FALSE;
497 }
498 }
499 return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
500 }
501
502 /**
503 * Returns true if this set contains none of the characters
504 * of the given string.
505 * @param s string containing characters to be checked for containment
506 * @return true if the test condition is met
507 */
containsNone(const UnicodeString & s) const508 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
509 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
510 s.length());
511 }
512
513 /**
514 * Returns <tt>true</tt> if this set contains any character whose low byte
515 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
516 * indexing.
517 */
matchesIndexValue(uint8_t v) const518 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
519 /* The index value v, in the range [0,255], is contained in this set if
520 * it is contained in any pair of this set. Pairs either have the high
521 * bytes equal, or unequal. If the high bytes are equal, then we have
522 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
523 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
524 * Then v is contained if xx <= v || v <= yy. (This is identical to the
525 * time zone month containment logic.)
526 */
527 int32_t i;
528 int32_t rangeCount=getRangeCount();
529 for (i=0; i<rangeCount; ++i) {
530 UChar32 low = getRangeStart(i);
531 UChar32 high = getRangeEnd(i);
532 if ((low & ~0xFF) == (high & ~0xFF)) {
533 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
534 return TRUE;
535 }
536 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
537 return TRUE;
538 }
539 }
540 if (hasStrings()) {
541 for (i=0; i<strings->size(); ++i) {
542 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
543 if (s.isEmpty()) {
544 continue; // skip the empty string
545 }
546 UChar32 c = s.char32At(0);
547 if ((c & 0xFF) == v) {
548 return TRUE;
549 }
550 }
551 }
552 return FALSE;
553 }
554
555 /**
556 * Implementation of UnicodeMatcher::matches(). Always matches the
557 * longest possible multichar string.
558 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)559 UMatchDegree UnicodeSet::matches(const Replaceable& text,
560 int32_t& offset,
561 int32_t limit,
562 UBool incremental) {
563 if (offset == limit) {
564 if (contains(U_ETHER)) {
565 return incremental ? U_PARTIAL_MATCH : U_MATCH;
566 } else {
567 return U_MISMATCH;
568 }
569 } else {
570 if (hasStrings()) { // try strings first
571
572 // might separate forward and backward loops later
573 // for now they are combined
574
575 // TODO Improve efficiency of this, at least in the forward
576 // direction, if not in both. In the forward direction we
577 // can assume the strings are sorted.
578
579 int32_t i;
580 UBool forward = offset < limit;
581
582 // firstChar is the leftmost char to match in the
583 // forward direction or the rightmost char to match in
584 // the reverse direction.
585 UChar firstChar = text.charAt(offset);
586
587 // If there are multiple strings that can match we
588 // return the longest match.
589 int32_t highWaterLength = 0;
590
591 for (i=0; i<strings->size(); ++i) {
592 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
593 if (trial.isEmpty()) {
594 continue; // skip the empty string
595 }
596
597 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
598
599 // Strings are sorted, so we can optimize in the
600 // forward direction.
601 if (forward && c > firstChar) break;
602 if (c != firstChar) continue;
603
604 int32_t matchLen = matchRest(text, offset, limit, trial);
605
606 if (incremental) {
607 int32_t maxLen = forward ? limit-offset : offset-limit;
608 if (matchLen == maxLen) {
609 // We have successfully matched but only up to limit.
610 return U_PARTIAL_MATCH;
611 }
612 }
613
614 if (matchLen == trial.length()) {
615 // We have successfully matched the whole string.
616 if (matchLen > highWaterLength) {
617 highWaterLength = matchLen;
618 }
619 // In the forward direction we know strings
620 // are sorted so we can bail early.
621 if (forward && matchLen < highWaterLength) {
622 break;
623 }
624 continue;
625 }
626 }
627
628 // We've checked all strings without a partial match.
629 // If we have full matches, return the longest one.
630 if (highWaterLength != 0) {
631 offset += forward ? highWaterLength : -highWaterLength;
632 return U_MATCH;
633 }
634 }
635 return UnicodeFilter::matches(text, offset, limit, incremental);
636 }
637 }
638
639 /**
640 * Returns the longest match for s in text at the given position.
641 * If limit > start then match forward from start+1 to limit
642 * matching all characters except s.charAt(0). If limit < start,
643 * go backward starting from start-1 matching all characters
644 * except s.charAt(s.length()-1). This method assumes that the
645 * first character, text.charAt(start), matches s, so it does not
646 * check it.
647 * @param text the text to match
648 * @param start the first character to match. In the forward
649 * direction, text.charAt(start) is matched against s.charAt(0).
650 * In the reverse direction, it is matched against
651 * s.charAt(s.length()-1).
652 * @param limit the limit offset for matching, either last+1 in
653 * the forward direction, or last-1 in the reverse direction,
654 * where last is the index of the last character to match.
655 * @return If part of s matches up to the limit, return |limit -
656 * start|. If all of s matches before reaching the limit, return
657 * s.length(). If there is a mismatch between s and text, return
658 * 0
659 */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)660 int32_t UnicodeSet::matchRest(const Replaceable& text,
661 int32_t start, int32_t limit,
662 const UnicodeString& s) {
663 int32_t i;
664 int32_t maxLen;
665 int32_t slen = s.length();
666 if (start < limit) {
667 maxLen = limit - start;
668 if (maxLen > slen) maxLen = slen;
669 for (i = 1; i < maxLen; ++i) {
670 if (text.charAt(start + i) != s.charAt(i)) return 0;
671 }
672 } else {
673 maxLen = start - limit;
674 if (maxLen > slen) maxLen = slen;
675 --slen; // <=> slen = s.length() - 1;
676 for (i = 1; i < maxLen; ++i) {
677 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
678 }
679 }
680 return maxLen;
681 }
682
683 /**
684 * Implement of UnicodeMatcher
685 */
addMatchSetTo(UnicodeSet & toUnionTo) const686 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
687 toUnionTo.addAll(*this);
688 }
689
690 /**
691 * Returns the index of the given character within this set, where
692 * the set is ordered by ascending code point. If the character
693 * is not in this set, return -1. The inverse of this method is
694 * <code>charAt()</code>.
695 * @return an index from 0..size()-1, or -1
696 */
indexOf(UChar32 c) const697 int32_t UnicodeSet::indexOf(UChar32 c) const {
698 if (c < MIN_VALUE || c > MAX_VALUE) {
699 return -1;
700 }
701 int32_t i = 0;
702 int32_t n = 0;
703 for (;;) {
704 UChar32 start = list[i++];
705 if (c < start) {
706 return -1;
707 }
708 UChar32 limit = list[i++];
709 if (c < limit) {
710 return n + c - start;
711 }
712 n += limit - start;
713 }
714 }
715
716 /**
717 * Returns the character at the given index within this set, where
718 * the set is ordered by ascending code point. If the index is
719 * out of range, return (UChar32)-1. The inverse of this method is
720 * <code>indexOf()</code>.
721 * @param index an index from 0..size()-1
722 * @return the character at the given index, or (UChar32)-1.
723 */
charAt(int32_t index) const724 UChar32 UnicodeSet::charAt(int32_t index) const {
725 if (index >= 0) {
726 // len2 is the largest even integer <= len, that is, it is len
727 // for even values and len-1 for odd values. With odd values
728 // the last entry is UNICODESET_HIGH.
729 int32_t len2 = len & ~1;
730 for (int32_t i=0; i < len2;) {
731 UChar32 start = list[i++];
732 int32_t count = list[i++] - start;
733 if (index < count) {
734 return (UChar32)(start + index);
735 }
736 index -= count;
737 }
738 }
739 return (UChar32)-1;
740 }
741
742 /**
743 * Make this object represent the range <code>start - end</code>.
744 * If <code>end > start</code> then this object is set to an
745 * an empty range.
746 *
747 * @param start first character in the set, inclusive
748 * @rparam end last character in the set, inclusive
749 */
set(UChar32 start,UChar32 end)750 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
751 clear();
752 complement(start, end);
753 return *this;
754 }
755
756 /**
757 * Adds the specified range to this set if it is not already
758 * present. If this set already contains the specified range,
759 * the call leaves this set unchanged. If <code>end > start</code>
760 * then an empty range is added, leaving the set unchanged.
761 *
762 * @param start first character, inclusive, of range to be added
763 * to this set.
764 * @param end last character, inclusive, of range to be added
765 * to this set.
766 */
add(UChar32 start,UChar32 end)767 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
768 if (pinCodePoint(start) < pinCodePoint(end)) {
769 UChar32 limit = end + 1;
770 // Fast path for adding a new range after the last one.
771 // Odd list length: [..., lastStart, lastLimit, HIGH]
772 if ((len & 1) != 0) {
773 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
774 UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
775 if (lastLimit <= start && !isFrozen() && !isBogus()) {
776 if (lastLimit == start) {
777 // Extend the last range.
778 list[len - 2] = limit;
779 if (limit == UNICODESET_HIGH) {
780 --len;
781 }
782 } else {
783 list[len - 1] = start;
784 if (limit < UNICODESET_HIGH) {
785 if (ensureCapacity(len + 2)) {
786 list[len++] = limit;
787 list[len++] = UNICODESET_HIGH;
788 }
789 } else { // limit == UNICODESET_HIGH
790 if (ensureCapacity(len + 1)) {
791 list[len++] = UNICODESET_HIGH;
792 }
793 }
794 }
795 releasePattern();
796 return *this;
797 }
798 }
799 // This is slow. Could be much faster using findCodePoint(start)
800 // and modifying the list, dealing with adjacent & overlapping ranges.
801 UChar32 range[3] = { start, limit, UNICODESET_HIGH };
802 add(range, 2, 0);
803 } else if (start == end) {
804 add(start);
805 }
806 return *this;
807 }
808
809 // #define DEBUG_US_ADD
810
811 #ifdef DEBUG_US_ADD
812 #include <stdio.h>
dump(UChar32 c)813 void dump(UChar32 c) {
814 if (c <= 0xFF) {
815 printf("%c", (char)c);
816 } else {
817 printf("U+%04X", c);
818 }
819 }
dump(const UChar32 * list,int32_t len)820 void dump(const UChar32* list, int32_t len) {
821 printf("[");
822 for (int32_t i=0; i<len; ++i) {
823 if (i != 0) printf(", ");
824 dump(list[i]);
825 }
826 printf("]");
827 }
828 #endif
829
830 /**
831 * Adds the specified character to this set if it is not already
832 * present. If this set already contains the specified character,
833 * the call leaves this set unchanged.
834 */
add(UChar32 c)835 UnicodeSet& UnicodeSet::add(UChar32 c) {
836 // find smallest i such that c < list[i]
837 // if odd, then it is IN the set
838 // if even, then it is OUT of the set
839 int32_t i = findCodePoint(pinCodePoint(c));
840
841 // already in set?
842 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
843
844 // HIGH is 0x110000
845 // assert(list[len-1] == HIGH);
846
847 // empty = [HIGH]
848 // [start_0, limit_0, start_1, limit_1, HIGH]
849
850 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
851 // ^
852 // list[i]
853
854 // i == 0 means c is before the first range
855
856 #ifdef DEBUG_US_ADD
857 printf("Add of ");
858 dump(c);
859 printf(" found at %d", i);
860 printf(": ");
861 dump(list, len);
862 printf(" => ");
863 #endif
864
865 if (c == list[i]-1) {
866 // c is before start of next range
867 list[i] = c;
868 // if we touched the HIGH mark, then add a new one
869 if (c == (UNICODESET_HIGH - 1)) {
870 if (!ensureCapacity(len+1)) {
871 // ensureCapacity will mark the object as Bogus if OOM failure happens.
872 return *this;
873 }
874 list[len++] = UNICODESET_HIGH;
875 }
876 if (i > 0 && c == list[i-1]) {
877 // collapse adjacent ranges
878
879 // [..., start_k-1, c, c, limit_k, ..., HIGH]
880 // ^
881 // list[i]
882
883 //for (int32_t k=i-1; k<len-2; ++k) {
884 // list[k] = list[k+2];
885 //}
886 UChar32* dst = list + i - 1;
887 UChar32* src = dst + 2;
888 UChar32* srclimit = list + len;
889 while (src < srclimit) *(dst++) = *(src++);
890
891 len -= 2;
892 }
893 }
894
895 else if (i > 0 && c == list[i-1]) {
896 // c is after end of prior range
897 list[i-1]++;
898 // no need to check for collapse here
899 }
900
901 else {
902 // At this point we know the new char is not adjacent to
903 // any existing ranges, and it is not 10FFFF.
904
905
906 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
907 // ^
908 // list[i]
909
910 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
911 // ^
912 // list[i]
913
914 if (!ensureCapacity(len+2)) {
915 // ensureCapacity will mark the object as Bogus if OOM failure happens.
916 return *this;
917 }
918
919 UChar32 *p = list + i;
920 uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
921 list[i] = c;
922 list[i+1] = c+1;
923 len += 2;
924 }
925
926 #ifdef DEBUG_US_ADD
927 dump(list, len);
928 printf("\n");
929
930 for (i=1; i<len; ++i) {
931 if (list[i] <= list[i-1]) {
932 // Corrupt array!
933 printf("ERROR: list has been corrupted\n");
934 exit(1);
935 }
936 }
937 #endif
938
939 releasePattern();
940 return *this;
941 }
942
943 /**
944 * Adds the specified multicharacter to this set if it is not already
945 * present. If this set already contains the multicharacter,
946 * the call leaves this set unchanged.
947 * Thus "ch" => {"ch"}
948 *
949 * @param s the source string
950 * @return the modified set, for chaining
951 */
add(const UnicodeString & s)952 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
953 if (isFrozen() || isBogus()) return *this;
954 int32_t cp = getSingleCP(s);
955 if (cp < 0) {
956 if (!stringsContains(s)) {
957 _add(s);
958 releasePattern();
959 }
960 } else {
961 add((UChar32)cp);
962 }
963 return *this;
964 }
965
966 /**
967 * Adds the given string, in order, to 'strings'. The given string
968 * must have been checked by the caller to not already be in 'strings'.
969 */
_add(const UnicodeString & s)970 void UnicodeSet::_add(const UnicodeString& s) {
971 if (isFrozen() || isBogus()) {
972 return;
973 }
974 UErrorCode ec = U_ZERO_ERROR;
975 if (strings == nullptr && !allocateStrings(ec)) {
976 setToBogus();
977 return;
978 }
979 UnicodeString* t = new UnicodeString(s);
980 if (t == NULL) { // Check for memory allocation error.
981 setToBogus();
982 return;
983 }
984 strings->sortedInsert(t, compareUnicodeString, ec);
985 if (U_FAILURE(ec)) {
986 setToBogus();
987 delete t;
988 }
989 }
990
991 /**
992 * @return a code point IF the string consists of a single one.
993 * otherwise returns -1.
994 * @param string to test
995 */
getSingleCP(const UnicodeString & s)996 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
997 int32_t sLength = s.length();
998 if (sLength == 1) return s.charAt(0);
999 if (sLength == 2) {
1000 UChar32 cp = s.char32At(0);
1001 if (cp > 0xFFFF) { // is surrogate pair
1002 return cp;
1003 }
1004 }
1005 return -1;
1006 }
1007
1008 /**
1009 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1010 * If this set already any particular character, it has no effect on that character.
1011 * @param the source string
1012 * @return the modified set, for chaining
1013 */
addAll(const UnicodeString & s)1014 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1015 UChar32 cp;
1016 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1017 cp = s.char32At(i);
1018 add(cp);
1019 }
1020 return *this;
1021 }
1022
1023 /**
1024 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1025 * If this set already any particular character, it has no effect on that character.
1026 * @param the source string
1027 * @return the modified set, for chaining
1028 */
retainAll(const UnicodeString & s)1029 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1030 UnicodeSet set;
1031 set.addAll(s);
1032 retainAll(set);
1033 return *this;
1034 }
1035
1036 /**
1037 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1038 * If this set already any particular character, it has no effect on that character.
1039 * @param the source string
1040 * @return the modified set, for chaining
1041 */
complementAll(const UnicodeString & s)1042 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1043 UnicodeSet set;
1044 set.addAll(s);
1045 complementAll(set);
1046 return *this;
1047 }
1048
1049 /**
1050 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1051 * If this set already any particular character, it has no effect on that character.
1052 * @param the source string
1053 * @return the modified set, for chaining
1054 */
removeAll(const UnicodeString & s)1055 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1056 UnicodeSet set;
1057 set.addAll(s);
1058 removeAll(set);
1059 return *this;
1060 }
1061
removeAllStrings()1062 UnicodeSet& UnicodeSet::removeAllStrings() {
1063 if (!isFrozen() && hasStrings()) {
1064 strings->removeAllElements();
1065 releasePattern();
1066 }
1067 return *this;
1068 }
1069
1070
1071 /**
1072 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1073 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1074 * @param the source string
1075 * @return a newly created set containing the given string
1076 */
createFrom(const UnicodeString & s)1077 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1078 UnicodeSet *set = new UnicodeSet();
1079 if (set != NULL) { // Check for memory allocation error.
1080 set->add(s);
1081 }
1082 return set;
1083 }
1084
1085
1086 /**
1087 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1088 * @param the source string
1089 * @return a newly created set containing the given characters
1090 */
createFromAll(const UnicodeString & s)1091 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1092 UnicodeSet *set = new UnicodeSet();
1093 if (set != NULL) { // Check for memory allocation error.
1094 set->addAll(s);
1095 }
1096 return set;
1097 }
1098
1099 /**
1100 * Retain only the elements in this set that are contained in the
1101 * specified range. If <code>end > start</code> then an empty range is
1102 * retained, leaving the set empty.
1103 *
1104 * @param start first character, inclusive, of range to be retained
1105 * to this set.
1106 * @param end last character, inclusive, of range to be retained
1107 * to this set.
1108 */
retain(UChar32 start,UChar32 end)1109 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1110 if (pinCodePoint(start) <= pinCodePoint(end)) {
1111 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1112 retain(range, 2, 0);
1113 } else {
1114 clear();
1115 }
1116 return *this;
1117 }
1118
retain(UChar32 c)1119 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1120 return retain(c, c);
1121 }
1122
retain(const UnicodeString & s)1123 UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1124 if (isFrozen() || isBogus()) { return *this; }
1125 UChar32 cp = getSingleCP(s);
1126 if (cp < 0) {
1127 bool isIn = stringsContains(s);
1128 // Check for getRangeCount() first to avoid somewhat-expensive size()
1129 // when there are single code points.
1130 if (isIn && getRangeCount() == 0 && size() == 1) {
1131 return *this;
1132 }
1133 clear();
1134 if (isIn) {
1135 _add(s);
1136 }
1137 } else {
1138 retain(cp, cp);
1139 }
1140 return *this;
1141 }
1142
1143 /**
1144 * Removes the specified range from this set if it is present.
1145 * The set will not contain the specified range once the call
1146 * returns. If <code>end > start</code> then an empty range is
1147 * removed, leaving the set unchanged.
1148 *
1149 * @param start first character, inclusive, of range to be removed
1150 * from this set.
1151 * @param end last character, inclusive, of range to be removed
1152 * from this set.
1153 */
remove(UChar32 start,UChar32 end)1154 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1155 if (pinCodePoint(start) <= pinCodePoint(end)) {
1156 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1157 retain(range, 2, 2);
1158 }
1159 return *this;
1160 }
1161
1162 /**
1163 * Removes the specified character from this set if it is present.
1164 * The set will not contain the specified range once the call
1165 * returns.
1166 */
remove(UChar32 c)1167 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1168 return remove(c, c);
1169 }
1170
1171 /**
1172 * Removes the specified string from this set if it is present.
1173 * The set will not contain the specified character once the call
1174 * returns.
1175 * @param the source string
1176 * @return the modified set, for chaining
1177 */
remove(const UnicodeString & s)1178 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1179 if (isFrozen() || isBogus()) return *this;
1180 int32_t cp = getSingleCP(s);
1181 if (cp < 0) {
1182 if (strings != nullptr && strings->removeElement((void*) &s)) {
1183 releasePattern();
1184 }
1185 } else {
1186 remove((UChar32)cp, (UChar32)cp);
1187 }
1188 return *this;
1189 }
1190
1191 /**
1192 * Complements the specified range in this set. Any character in
1193 * the range will be removed if it is in this set, or will be
1194 * added if it is not in this set. If <code>end > start</code>
1195 * then an empty range is xor'ed, leaving the set unchanged.
1196 *
1197 * @param start first character, inclusive, of range to be removed
1198 * from this set.
1199 * @param end last character, inclusive, of range to be removed
1200 * from this set.
1201 */
complement(UChar32 start,UChar32 end)1202 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1203 if (isFrozen() || isBogus()) {
1204 return *this;
1205 }
1206 if (pinCodePoint(start) <= pinCodePoint(end)) {
1207 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1208 exclusiveOr(range, 2, 0);
1209 }
1210 releasePattern();
1211 return *this;
1212 }
1213
complement(UChar32 c)1214 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1215 return complement(c, c);
1216 }
1217
1218 /**
1219 * This is equivalent to
1220 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1221 */
complement(void)1222 UnicodeSet& UnicodeSet::complement(void) {
1223 if (isFrozen() || isBogus()) {
1224 return *this;
1225 }
1226 if (list[0] == UNICODESET_LOW) {
1227 uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1228 --len;
1229 } else {
1230 if (!ensureCapacity(len+1)) {
1231 return *this;
1232 }
1233 uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1234 list[0] = UNICODESET_LOW;
1235 ++len;
1236 }
1237 releasePattern();
1238 return *this;
1239 }
1240
1241 /**
1242 * Complement the specified string in this set.
1243 * The set will not contain the specified string once the call
1244 * returns.
1245 *
1246 * @param s the string to complement
1247 * @return this object, for chaining
1248 */
complement(const UnicodeString & s)1249 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1250 if (isFrozen() || isBogus()) return *this;
1251 int32_t cp = getSingleCP(s);
1252 if (cp < 0) {
1253 if (stringsContains(s)) {
1254 strings->removeElement((void*) &s);
1255 } else {
1256 _add(s);
1257 }
1258 releasePattern();
1259 } else {
1260 complement((UChar32)cp, (UChar32)cp);
1261 }
1262 return *this;
1263 }
1264
1265 /**
1266 * Adds all of the elements in the specified set to this set if
1267 * they're not already present. This operation effectively
1268 * modifies this set so that its value is the <i>union</i> of the two
1269 * sets. The behavior of this operation is unspecified if the specified
1270 * collection is modified while the operation is in progress.
1271 *
1272 * @param c set whose elements are to be added to this set.
1273 * @see #add(char, char)
1274 */
addAll(const UnicodeSet & c)1275 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1276 if ( c.len>0 && c.list!=NULL ) {
1277 add(c.list, c.len, 0);
1278 }
1279
1280 // Add strings in order
1281 if ( c.strings!=NULL ) {
1282 for (int32_t i=0; i<c.strings->size(); ++i) {
1283 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1284 if (!stringsContains(*s)) {
1285 _add(*s);
1286 }
1287 }
1288 }
1289 return *this;
1290 }
1291
1292 /**
1293 * Retains only the elements in this set that are contained in the
1294 * specified set. In other words, removes from this set all of
1295 * its elements that are not contained in the specified set. This
1296 * operation effectively modifies this set so that its value is
1297 * the <i>intersection</i> of the two sets.
1298 *
1299 * @param c set that defines which elements this set will retain.
1300 */
retainAll(const UnicodeSet & c)1301 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1302 if (isFrozen() || isBogus()) {
1303 return *this;
1304 }
1305 retain(c.list, c.len, 0);
1306 if (hasStrings()) {
1307 if (!c.hasStrings()) {
1308 strings->removeAllElements();
1309 } else {
1310 strings->retainAll(*c.strings);
1311 }
1312 }
1313 return *this;
1314 }
1315
1316 /**
1317 * Removes from this set all of its elements that are contained in the
1318 * specified set. This operation effectively modifies this
1319 * set so that its value is the <i>asymmetric set difference</i> of
1320 * the two sets.
1321 *
1322 * @param c set that defines which elements will be removed from
1323 * this set.
1324 */
removeAll(const UnicodeSet & c)1325 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1326 if (isFrozen() || isBogus()) {
1327 return *this;
1328 }
1329 retain(c.list, c.len, 2);
1330 if (hasStrings() && c.hasStrings()) {
1331 strings->removeAll(*c.strings);
1332 }
1333 return *this;
1334 }
1335
1336 /**
1337 * Complements in this set all elements contained in the specified
1338 * set. Any character in the other set will be removed if it is
1339 * in this set, or will be added if it is not in this set.
1340 *
1341 * @param c set that defines which elements will be xor'ed from
1342 * this set.
1343 */
complementAll(const UnicodeSet & c)1344 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1345 if (isFrozen() || isBogus()) {
1346 return *this;
1347 }
1348 exclusiveOr(c.list, c.len, 0);
1349
1350 if (c.strings != nullptr) {
1351 for (int32_t i=0; i<c.strings->size(); ++i) {
1352 void* e = c.strings->elementAt(i);
1353 if (strings == nullptr || !strings->removeElement(e)) {
1354 _add(*(const UnicodeString*)e);
1355 }
1356 }
1357 }
1358 return *this;
1359 }
1360
1361 /**
1362 * Removes all of the elements from this set. This set will be
1363 * empty after this call returns.
1364 */
clear(void)1365 UnicodeSet& UnicodeSet::clear(void) {
1366 if (isFrozen()) {
1367 return *this;
1368 }
1369 list[0] = UNICODESET_HIGH;
1370 len = 1;
1371 releasePattern();
1372 if (strings != NULL) {
1373 strings->removeAllElements();
1374 }
1375 // Remove bogus
1376 fFlags = 0;
1377 return *this;
1378 }
1379
1380 /**
1381 * Iteration method that returns the number of ranges contained in
1382 * this set.
1383 * @see #getRangeStart
1384 * @see #getRangeEnd
1385 */
getRangeCount() const1386 int32_t UnicodeSet::getRangeCount() const {
1387 return len/2;
1388 }
1389
1390 /**
1391 * Iteration method that returns the first character in the
1392 * specified range of this set.
1393 * @see #getRangeCount
1394 * @see #getRangeEnd
1395 */
getRangeStart(int32_t index) const1396 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1397 return list[index*2];
1398 }
1399
1400 /**
1401 * Iteration method that returns the last character in the
1402 * specified range of this set.
1403 * @see #getRangeStart
1404 * @see #getRangeEnd
1405 */
getRangeEnd(int32_t index) const1406 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1407 return list[index*2 + 1] - 1;
1408 }
1409
getString(int32_t index) const1410 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1411 return (const UnicodeString*) strings->elementAt(index);
1412 }
1413
1414 /**
1415 * Reallocate this objects internal structures to take up the least
1416 * possible space, without changing this object's value.
1417 */
compact()1418 UnicodeSet& UnicodeSet::compact() {
1419 if (isFrozen() || isBogus()) {
1420 return *this;
1421 }
1422 // Delete buffer first to defragment memory less.
1423 if (buffer != stackList) {
1424 uprv_free(buffer);
1425 buffer = NULL;
1426 bufferCapacity = 0;
1427 }
1428 if (list == stackList) {
1429 // pass
1430 } else if (len <= INITIAL_CAPACITY) {
1431 uprv_memcpy(stackList, list, len * sizeof(UChar32));
1432 uprv_free(list);
1433 list = stackList;
1434 capacity = INITIAL_CAPACITY;
1435 } else if ((len + 7) < capacity) {
1436 // If we have more than a little unused capacity, shrink it to len.
1437 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1438 if (temp) {
1439 list = temp;
1440 capacity = len;
1441 }
1442 // else what the heck happened?! We allocated less memory!
1443 // Oh well. We'll keep our original array.
1444 }
1445 if (strings != nullptr && strings->isEmpty()) {
1446 delete strings;
1447 strings = nullptr;
1448 }
1449 return *this;
1450 }
1451
1452 #ifdef DEBUG_SERIALIZE
1453 #include <stdio.h>
1454 #endif
1455
1456 /**
1457 * Deserialize constructor.
1458 */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1459 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1460 UErrorCode &ec) {
1461
1462 if(U_FAILURE(ec)) {
1463 setToBogus();
1464 return;
1465 }
1466
1467 if( (serialization != kSerialized)
1468 || (data==NULL)
1469 || (dataLen < 1)) {
1470 ec = U_ILLEGAL_ARGUMENT_ERROR;
1471 setToBogus();
1472 return;
1473 }
1474
1475 // bmp?
1476 int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1477 int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1478
1479 int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1480 #ifdef DEBUG_SERIALIZE
1481 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1482 #endif
1483 if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
1484 return;
1485 }
1486 // copy bmp
1487 int32_t i;
1488 for(i = 0; i< bmpLength;i++) {
1489 list[i] = data[i+headerSize];
1490 #ifdef DEBUG_SERIALIZE
1491 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1492 #endif
1493 }
1494 // copy smp
1495 for(i=bmpLength;i<newLength;i++) {
1496 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1497 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1498 #ifdef DEBUG_SERIALIZE
1499 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1500 #endif
1501 }
1502 U_ASSERT(i == newLength);
1503 if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1504 list[i++] = UNICODESET_HIGH;
1505 }
1506 len = i;
1507 }
1508
1509
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1510 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1511 int32_t bmpLength, length, destLength;
1512
1513 if (U_FAILURE(ec)) {
1514 return 0;
1515 }
1516
1517 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1518 ec=U_ILLEGAL_ARGUMENT_ERROR;
1519 return 0;
1520 }
1521
1522 /* count necessary 16-bit units */
1523 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1524 // assert(length>=0);
1525 if (length==0) {
1526 /* empty set */
1527 if (destCapacity>0) {
1528 *dest=0;
1529 } else {
1530 ec=U_BUFFER_OVERFLOW_ERROR;
1531 }
1532 return 1;
1533 }
1534 /* now length>0 */
1535
1536 if (this->list[length-1]<=0xffff) {
1537 /* all BMP */
1538 bmpLength=length;
1539 } else if (this->list[0]>=0x10000) {
1540 /* all supplementary */
1541 bmpLength=0;
1542 length*=2;
1543 } else {
1544 /* some BMP, some supplementary */
1545 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1546 length=bmpLength+2*(length-bmpLength);
1547 }
1548 #ifdef DEBUG_SERIALIZE
1549 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1550 #endif
1551 /* length: number of 16-bit array units */
1552 if (length>0x7fff) {
1553 /* there are only 15 bits for the length in the first serialized word */
1554 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1555 return 0;
1556 }
1557
1558 /*
1559 * total serialized length:
1560 * number of 16-bit array units (length) +
1561 * 1 length unit (always) +
1562 * 1 bmpLength unit (if there are supplementary values)
1563 */
1564 destLength=length+((length>bmpLength)?2:1);
1565 if (destLength<=destCapacity) {
1566 const UChar32 *p;
1567 int32_t i;
1568
1569 #ifdef DEBUG_SERIALIZE
1570 printf("writeHdr\n");
1571 #endif
1572 *dest=(uint16_t)length;
1573 if (length>bmpLength) {
1574 *dest|=0x8000;
1575 *++dest=(uint16_t)bmpLength;
1576 }
1577 ++dest;
1578
1579 /* write the BMP part of the array */
1580 p=this->list;
1581 for (i=0; i<bmpLength; ++i) {
1582 #ifdef DEBUG_SERIALIZE
1583 printf("writebmp: %x\n", (int)*p);
1584 #endif
1585 *dest++=(uint16_t)*p++;
1586 }
1587
1588 /* write the supplementary part of the array */
1589 for (; i<length; i+=2) {
1590 #ifdef DEBUG_SERIALIZE
1591 printf("write32: %x\n", (int)*p);
1592 #endif
1593 *dest++=(uint16_t)(*p>>16);
1594 *dest++=(uint16_t)*p++;
1595 }
1596 } else {
1597 ec=U_BUFFER_OVERFLOW_ERROR;
1598 }
1599 return destLength;
1600 }
1601
1602 //----------------------------------------------------------------
1603 // Implementation: Utility methods
1604 //----------------------------------------------------------------
1605
1606 /**
1607 * Allocate our strings vector and return TRUE if successful.
1608 */
allocateStrings(UErrorCode & status)1609 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1610 if (U_FAILURE(status)) {
1611 return FALSE;
1612 }
1613 strings = new UVector(uprv_deleteUObject,
1614 uhash_compareUnicodeString, 1, status);
1615 if (strings == NULL) { // Check for memory allocation error.
1616 status = U_MEMORY_ALLOCATION_ERROR;
1617 return FALSE;
1618 }
1619 if (U_FAILURE(status)) {
1620 delete strings;
1621 strings = NULL;
1622 return FALSE;
1623 }
1624 return TRUE;
1625 }
1626
nextCapacity(int32_t minCapacity)1627 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1628 // Grow exponentially to reduce the frequency of allocations.
1629 if (minCapacity < INITIAL_CAPACITY) {
1630 return minCapacity + INITIAL_CAPACITY;
1631 } else if (minCapacity <= 2500) {
1632 return 5 * minCapacity;
1633 } else {
1634 int32_t newCapacity = 2 * minCapacity;
1635 if (newCapacity > MAX_LENGTH) {
1636 newCapacity = MAX_LENGTH;
1637 }
1638 return newCapacity;
1639 }
1640 }
1641
ensureCapacity(int32_t newLen)1642 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1643 if (newLen > MAX_LENGTH) {
1644 newLen = MAX_LENGTH;
1645 }
1646 if (newLen <= capacity) {
1647 return true;
1648 }
1649 int32_t newCapacity = nextCapacity(newLen);
1650 UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1651 if (temp == NULL) {
1652 setToBogus(); // set the object to bogus state if an OOM failure occurred.
1653 return false;
1654 }
1655 // Copy only the actual contents.
1656 uprv_memcpy(temp, list, len * sizeof(UChar32));
1657 if (list != stackList) {
1658 uprv_free(list);
1659 }
1660 list = temp;
1661 capacity = newCapacity;
1662 return true;
1663 }
1664
ensureBufferCapacity(int32_t newLen)1665 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1666 if (newLen > MAX_LENGTH) {
1667 newLen = MAX_LENGTH;
1668 }
1669 if (newLen <= bufferCapacity) {
1670 return true;
1671 }
1672 int32_t newCapacity = nextCapacity(newLen);
1673 UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1674 if (temp == NULL) {
1675 setToBogus();
1676 return false;
1677 }
1678 // The buffer has no contents to be copied.
1679 // It is always filled from scratch after this call.
1680 if (buffer != stackList) {
1681 uprv_free(buffer);
1682 }
1683 buffer = temp;
1684 bufferCapacity = newCapacity;
1685 return true;
1686 }
1687
1688 /**
1689 * Swap list and buffer.
1690 */
swapBuffers(void)1691 void UnicodeSet::swapBuffers(void) {
1692 // swap list and buffer
1693 UChar32* temp = list;
1694 list = buffer;
1695 buffer = temp;
1696
1697 int32_t c = capacity;
1698 capacity = bufferCapacity;
1699 bufferCapacity = c;
1700 }
1701
setToBogus()1702 void UnicodeSet::setToBogus() {
1703 clear(); // Remove everything in the set.
1704 fFlags = kIsBogus;
1705 }
1706
1707 //----------------------------------------------------------------
1708 // Implementation: Fundamental operators
1709 //----------------------------------------------------------------
1710
max(UChar32 a,UChar32 b)1711 static inline UChar32 max(UChar32 a, UChar32 b) {
1712 return (a > b) ? a : b;
1713 }
1714
1715 // polarity = 0, 3 is normal: x xor y
1716 // polarity = 1, 2: x xor ~y == x === y
1717
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1718 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1719 if (isFrozen() || isBogus()) {
1720 return;
1721 }
1722 if (!ensureBufferCapacity(len + otherLen)) {
1723 return;
1724 }
1725
1726 int32_t i = 0, j = 0, k = 0;
1727 UChar32 a = list[i++];
1728 UChar32 b;
1729 if (polarity == 1 || polarity == 2) {
1730 b = UNICODESET_LOW;
1731 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1732 ++j;
1733 b = other[j];
1734 }
1735 } else {
1736 b = other[j++];
1737 }
1738 // simplest of all the routines
1739 // sort the values, discarding identicals!
1740 for (;;) {
1741 if (a < b) {
1742 buffer[k++] = a;
1743 a = list[i++];
1744 } else if (b < a) {
1745 buffer[k++] = b;
1746 b = other[j++];
1747 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1748 // discard both values!
1749 a = list[i++];
1750 b = other[j++];
1751 } else { // DONE!
1752 buffer[k++] = UNICODESET_HIGH;
1753 len = k;
1754 break;
1755 }
1756 }
1757 swapBuffers();
1758 releasePattern();
1759 }
1760
1761 // polarity = 0 is normal: x union y
1762 // polarity = 2: x union ~y
1763 // polarity = 1: ~x union y
1764 // polarity = 3: ~x union ~y
1765
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1766 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1767 if (isFrozen() || isBogus() || other==NULL) {
1768 return;
1769 }
1770 if (!ensureBufferCapacity(len + otherLen)) {
1771 return;
1772 }
1773
1774 int32_t i = 0, j = 0, k = 0;
1775 UChar32 a = list[i++];
1776 UChar32 b = other[j++];
1777 // change from xor is that we have to check overlapping pairs
1778 // polarity bit 1 means a is second, bit 2 means b is.
1779 for (;;) {
1780 switch (polarity) {
1781 case 0: // both first; take lower if unequal
1782 if (a < b) { // take a
1783 // Back up over overlapping ranges in buffer[]
1784 if (k > 0 && a <= buffer[k-1]) {
1785 // Pick latter end value in buffer[] vs. list[]
1786 a = max(list[i], buffer[--k]);
1787 } else {
1788 // No overlap
1789 buffer[k++] = a;
1790 a = list[i];
1791 }
1792 i++; // Common if/else code factored out
1793 polarity ^= 1;
1794 } else if (b < a) { // take b
1795 if (k > 0 && b <= buffer[k-1]) {
1796 b = max(other[j], buffer[--k]);
1797 } else {
1798 buffer[k++] = b;
1799 b = other[j];
1800 }
1801 j++;
1802 polarity ^= 2;
1803 } else { // a == b, take a, drop b
1804 if (a == UNICODESET_HIGH) goto loop_end;
1805 // This is symmetrical; it doesn't matter if
1806 // we backtrack with a or b. - liu
1807 if (k > 0 && a <= buffer[k-1]) {
1808 a = max(list[i], buffer[--k]);
1809 } else {
1810 // No overlap
1811 buffer[k++] = a;
1812 a = list[i];
1813 }
1814 i++;
1815 polarity ^= 1;
1816 b = other[j++];
1817 polarity ^= 2;
1818 }
1819 break;
1820 case 3: // both second; take higher if unequal, and drop other
1821 if (b <= a) { // take a
1822 if (a == UNICODESET_HIGH) goto loop_end;
1823 buffer[k++] = a;
1824 } else { // take b
1825 if (b == UNICODESET_HIGH) goto loop_end;
1826 buffer[k++] = b;
1827 }
1828 a = list[i++];
1829 polarity ^= 1; // factored common code
1830 b = other[j++];
1831 polarity ^= 2;
1832 break;
1833 case 1: // a second, b first; if b < a, overlap
1834 if (a < b) { // no overlap, take a
1835 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836 } else if (b < a) { // OVERLAP, drop b
1837 b = other[j++];
1838 polarity ^= 2;
1839 } else { // a == b, drop both!
1840 if (a == UNICODESET_HIGH) goto loop_end;
1841 a = list[i++];
1842 polarity ^= 1;
1843 b = other[j++];
1844 polarity ^= 2;
1845 }
1846 break;
1847 case 2: // a first, b second; if a < b, overlap
1848 if (b < a) { // no overlap, take b
1849 buffer[k++] = b;
1850 b = other[j++];
1851 polarity ^= 2;
1852 } else if (a < b) { // OVERLAP, drop a
1853 a = list[i++];
1854 polarity ^= 1;
1855 } else { // a == b, drop both!
1856 if (a == UNICODESET_HIGH) goto loop_end;
1857 a = list[i++];
1858 polarity ^= 1;
1859 b = other[j++];
1860 polarity ^= 2;
1861 }
1862 break;
1863 }
1864 }
1865 loop_end:
1866 buffer[k++] = UNICODESET_HIGH; // terminate
1867 len = k;
1868 swapBuffers();
1869 releasePattern();
1870 }
1871
1872 // polarity = 0 is normal: x intersect y
1873 // polarity = 2: x intersect ~y == set-minus
1874 // polarity = 1: ~x intersect y
1875 // polarity = 3: ~x intersect ~y
1876
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1877 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878 if (isFrozen() || isBogus()) {
1879 return;
1880 }
1881 if (!ensureBufferCapacity(len + otherLen)) {
1882 return;
1883 }
1884
1885 int32_t i = 0, j = 0, k = 0;
1886 UChar32 a = list[i++];
1887 UChar32 b = other[j++];
1888 // change from xor is that we have to check overlapping pairs
1889 // polarity bit 1 means a is second, bit 2 means b is.
1890 for (;;) {
1891 switch (polarity) {
1892 case 0: // both first; drop the smaller
1893 if (a < b) { // drop a
1894 a = list[i++];
1895 polarity ^= 1;
1896 } else if (b < a) { // drop b
1897 b = other[j++];
1898 polarity ^= 2;
1899 } else { // a == b, take one, drop other
1900 if (a == UNICODESET_HIGH) goto loop_end;
1901 buffer[k++] = a;
1902 a = list[i++];
1903 polarity ^= 1;
1904 b = other[j++];
1905 polarity ^= 2;
1906 }
1907 break;
1908 case 3: // both second; take lower if unequal
1909 if (a < b) { // take a
1910 buffer[k++] = a;
1911 a = list[i++];
1912 polarity ^= 1;
1913 } else if (b < a) { // take b
1914 buffer[k++] = b;
1915 b = other[j++];
1916 polarity ^= 2;
1917 } else { // a == b, take one, drop other
1918 if (a == UNICODESET_HIGH) goto loop_end;
1919 buffer[k++] = a;
1920 a = list[i++];
1921 polarity ^= 1;
1922 b = other[j++];
1923 polarity ^= 2;
1924 }
1925 break;
1926 case 1: // a second, b first;
1927 if (a < b) { // NO OVERLAP, drop a
1928 a = list[i++];
1929 polarity ^= 1;
1930 } else if (b < a) { // OVERLAP, take b
1931 buffer[k++] = b;
1932 b = other[j++];
1933 polarity ^= 2;
1934 } else { // a == b, drop both!
1935 if (a == UNICODESET_HIGH) goto loop_end;
1936 a = list[i++];
1937 polarity ^= 1;
1938 b = other[j++];
1939 polarity ^= 2;
1940 }
1941 break;
1942 case 2: // a first, b second; if a < b, overlap
1943 if (b < a) { // no overlap, drop b
1944 b = other[j++];
1945 polarity ^= 2;
1946 } else if (a < b) { // OVERLAP, take a
1947 buffer[k++] = a;
1948 a = list[i++];
1949 polarity ^= 1;
1950 } else { // a == b, drop both!
1951 if (a == UNICODESET_HIGH) goto loop_end;
1952 a = list[i++];
1953 polarity ^= 1;
1954 b = other[j++];
1955 polarity ^= 2;
1956 }
1957 break;
1958 }
1959 }
1960 loop_end:
1961 buffer[k++] = UNICODESET_HIGH; // terminate
1962 len = k;
1963 swapBuffers();
1964 releasePattern();
1965 }
1966
1967 /**
1968 * Append the <code>toPattern()</code> representation of a
1969 * string to the given <code>StringBuffer</code>.
1970 */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1971 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1972 escapeUnprintable) {
1973 UChar32 cp;
1974 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1975 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1976 }
1977 }
1978
1979 /**
1980 * Append the <code>toPattern()</code> representation of a
1981 * character to the given <code>StringBuffer</code>.
1982 */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1983 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1984 escapeUnprintable) {
1985 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1986 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1987 // unprintable
1988 if (ICU_Utility::escapeUnprintable(buf, c)) {
1989 return;
1990 }
1991 }
1992 // Okay to let ':' pass through
1993 switch (c) {
1994 case u'[':
1995 case u']':
1996 case u'-':
1997 case u'^':
1998 case u'&':
1999 case u'\\':
2000 case u'{':
2001 case u'}':
2002 case u':':
2003 case SymbolTable::SYMBOL_REF:
2004 buf.append(u'\\');
2005 break;
2006 default:
2007 // Escape whitespace
2008 if (PatternProps::isWhiteSpace(c)) {
2009 buf.append(u'\\');
2010 }
2011 break;
2012 }
2013 buf.append(c);
2014 }
2015
2016 /**
2017 * Append a string representation of this set to result. This will be
2018 * a cleaned version of the string passed to applyPattern(), if there
2019 * is one. Otherwise it will be generated.
2020 */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2021 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2022 UBool escapeUnprintable) const
2023 {
2024 if (pat != NULL) {
2025 int32_t i;
2026 int32_t backslashCount = 0;
2027 for (i=0; i<patLen; ) {
2028 UChar32 c;
2029 U16_NEXT(pat, i, patLen, c);
2030 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2031 // If the unprintable character is preceded by an odd
2032 // number of backslashes, then it has been escaped.
2033 // Before unescaping it, we delete the final
2034 // backslash.
2035 if ((backslashCount % 2) == 1) {
2036 result.truncate(result.length() - 1);
2037 }
2038 ICU_Utility::escapeUnprintable(result, c);
2039 backslashCount = 0;
2040 } else {
2041 result.append(c);
2042 if (c == u'\\') {
2043 ++backslashCount;
2044 } else {
2045 backslashCount = 0;
2046 }
2047 }
2048 }
2049 return result;
2050 }
2051
2052 return _generatePattern(result, escapeUnprintable);
2053 }
2054
2055 /**
2056 * Returns a string representation of this set. If the result of
2057 * calling this function is passed to a UnicodeSet constructor, it
2058 * will produce another set that is equal to this one.
2059 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2060 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2061 UBool escapeUnprintable) const
2062 {
2063 result.truncate(0);
2064 return _toPattern(result, escapeUnprintable);
2065 }
2066
2067 /**
2068 * Generate and append a string representation of this set to result.
2069 * This does not use this.pat, the cleaned up copy of the string
2070 * passed to applyPattern().
2071 */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2072 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2073 UBool escapeUnprintable) const
2074 {
2075 result.append(u'[');
2076
2077 // // Check against the predefined categories. We implicitly build
2078 // // up ALL category sets the first time toPattern() is called.
2079 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2080 // if (*this == getCategorySet(cat)) {
2081 // result.append(u':');
2082 // result.append(CATEGORY_NAMES, cat*2, 2);
2083 // return result.append(CATEGORY_CLOSE);
2084 // }
2085 // }
2086
2087 int32_t count = getRangeCount();
2088
2089 // If the set contains at least 2 intervals and includes both
2090 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2091 // be more economical.
2092 if (count > 1 &&
2093 getRangeStart(0) == MIN_VALUE &&
2094 getRangeEnd(count-1) == MAX_VALUE) {
2095
2096 // Emit the inverse
2097 result.append(u'^');
2098
2099 for (int32_t i = 1; i < count; ++i) {
2100 UChar32 start = getRangeEnd(i-1)+1;
2101 UChar32 end = getRangeStart(i)-1;
2102 _appendToPat(result, start, escapeUnprintable);
2103 if (start != end) {
2104 if ((start+1) != end) {
2105 result.append(u'-');
2106 }
2107 _appendToPat(result, end, escapeUnprintable);
2108 }
2109 }
2110 }
2111
2112 // Default; emit the ranges as pairs
2113 else {
2114 for (int32_t i = 0; i < count; ++i) {
2115 UChar32 start = getRangeStart(i);
2116 UChar32 end = getRangeEnd(i);
2117 _appendToPat(result, start, escapeUnprintable);
2118 if (start != end) {
2119 if ((start+1) != end) {
2120 result.append(u'-');
2121 }
2122 _appendToPat(result, end, escapeUnprintable);
2123 }
2124 }
2125 }
2126
2127 if (strings != nullptr) {
2128 for (int32_t i = 0; i<strings->size(); ++i) {
2129 result.append(u'{');
2130 _appendToPat(result,
2131 *(const UnicodeString*) strings->elementAt(i),
2132 escapeUnprintable);
2133 result.append(u'}');
2134 }
2135 }
2136 return result.append(u']');
2137 }
2138
2139 /**
2140 * Release existing cached pattern
2141 */
releasePattern()2142 void UnicodeSet::releasePattern() {
2143 if (pat) {
2144 uprv_free(pat);
2145 pat = NULL;
2146 patLen = 0;
2147 }
2148 }
2149
2150 /**
2151 * Set the new pattern to cache.
2152 */
setPattern(const char16_t * newPat,int32_t newPatLen)2153 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2154 releasePattern();
2155 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2156 if (pat) {
2157 patLen = newPatLen;
2158 u_memcpy(pat, newPat, patLen);
2159 pat[patLen] = 0;
2160 }
2161 // else we don't care if malloc failed. This was just a nice cache.
2162 // We can regenerate an equivalent pattern later when requested.
2163 }
2164
freeze()2165 UnicodeSet *UnicodeSet::freeze() {
2166 if(!isFrozen() && !isBogus()) {
2167 compact();
2168
2169 // Optimize contains() and span() and similar functions.
2170 if (hasStrings()) {
2171 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2172 if (stringSpan == nullptr) {
2173 setToBogus();
2174 return this;
2175 } else if (!stringSpan->needsStringSpanUTF16()) {
2176 // All strings are irrelevant for span() etc. because
2177 // all of each string's code points are contained in this set.
2178 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2179 // many relevant strings as UTF-16.
2180 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2181 delete stringSpan;
2182 stringSpan = NULL;
2183 }
2184 }
2185 if (stringSpan == NULL) {
2186 // No span-relevant strings: Optimize for code point spans.
2187 bmpSet=new BMPSet(list, len);
2188 if (bmpSet == NULL) { // Check for memory allocation error.
2189 setToBogus();
2190 }
2191 }
2192 }
2193 return this;
2194 }
2195
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2196 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2197 if(length>0 && bmpSet!=NULL) {
2198 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2199 }
2200 if(length<0) {
2201 length=u_strlen(s);
2202 }
2203 if(length==0) {
2204 return 0;
2205 }
2206 if(stringSpan!=NULL) {
2207 return stringSpan->span(s, length, spanCondition);
2208 } else if(hasStrings()) {
2209 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2210 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2211 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2212 UnicodeSetStringSpan strSpan(*this, *strings, which);
2213 if(strSpan.needsStringSpanUTF16()) {
2214 return strSpan.span(s, length, spanCondition);
2215 }
2216 }
2217
2218 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2219 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2220 }
2221
2222 UChar32 c;
2223 int32_t start=0, prev=0;
2224 do {
2225 U16_NEXT(s, start, length, c);
2226 if(spanCondition!=contains(c)) {
2227 break;
2228 }
2229 } while((prev=start)<length);
2230 return prev;
2231 }
2232
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2233 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2234 if(length>0 && bmpSet!=NULL) {
2235 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2236 }
2237 if(length<0) {
2238 length=u_strlen(s);
2239 }
2240 if(length==0) {
2241 return 0;
2242 }
2243 if(stringSpan!=NULL) {
2244 return stringSpan->spanBack(s, length, spanCondition);
2245 } else if(hasStrings()) {
2246 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2247 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2248 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2249 UnicodeSetStringSpan strSpan(*this, *strings, which);
2250 if(strSpan.needsStringSpanUTF16()) {
2251 return strSpan.spanBack(s, length, spanCondition);
2252 }
2253 }
2254
2255 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2256 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2257 }
2258
2259 UChar32 c;
2260 int32_t prev=length;
2261 do {
2262 U16_PREV(s, 0, length, c);
2263 if(spanCondition!=contains(c)) {
2264 break;
2265 }
2266 } while((prev=length)>0);
2267 return prev;
2268 }
2269
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2270 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2271 if(length>0 && bmpSet!=NULL) {
2272 const uint8_t *s0=(const uint8_t *)s;
2273 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2274 }
2275 if(length<0) {
2276 length=(int32_t)uprv_strlen(s);
2277 }
2278 if(length==0) {
2279 return 0;
2280 }
2281 if(stringSpan!=NULL) {
2282 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2283 } else if(hasStrings()) {
2284 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2285 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2286 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2287 UnicodeSetStringSpan strSpan(*this, *strings, which);
2288 if(strSpan.needsStringSpanUTF8()) {
2289 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2290 }
2291 }
2292
2293 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2294 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2295 }
2296
2297 UChar32 c;
2298 int32_t start=0, prev=0;
2299 do {
2300 U8_NEXT_OR_FFFD(s, start, length, c);
2301 if(spanCondition!=contains(c)) {
2302 break;
2303 }
2304 } while((prev=start)<length);
2305 return prev;
2306 }
2307
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2308 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2309 if(length>0 && bmpSet!=NULL) {
2310 const uint8_t *s0=(const uint8_t *)s;
2311 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2312 }
2313 if(length<0) {
2314 length=(int32_t)uprv_strlen(s);
2315 }
2316 if(length==0) {
2317 return 0;
2318 }
2319 if(stringSpan!=NULL) {
2320 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2321 } else if(hasStrings()) {
2322 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2323 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2324 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2325 UnicodeSetStringSpan strSpan(*this, *strings, which);
2326 if(strSpan.needsStringSpanUTF8()) {
2327 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2328 }
2329 }
2330
2331 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2332 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2333 }
2334
2335 UChar32 c;
2336 int32_t prev=length;
2337 do {
2338 U8_PREV_OR_FFFD(s, 0, length, c);
2339 if(spanCondition!=contains(c)) {
2340 break;
2341 }
2342 } while((prev=length)>0);
2343 return prev;
2344 }
2345
2346 U_NAMESPACE_END
2347