1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "ruleiter.h"
21 #include "cmemory.h"
22 #include "cstring.h"
23 #include "patternprops.h"
24 #include "uelement.h"
25 #include "util.h"
26 #include "uvector.h"
27 #include "charstr.h"
28 #include "ustrfmt.h"
29 #include "uassert.h"
30 #include "bmpset.h"
31 #include "unisetspan.h"
32 
33 // HIGH_VALUE > all valid values. 110000 for codepoints
34 #define UNICODESET_HIGH 0x0110000
35 
36 // LOW <= all valid values. ZERO for codepoints
37 #define UNICODESET_LOW 0x000000
38 
39 /** Max list [0, 1, 2, ..., max code point, HIGH] */
40 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
41 
42 U_NAMESPACE_BEGIN
43 
~SymbolTable()44 SymbolTable::~SymbolTable() {}
45 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
47 
48 /**
49  * Modify the given UChar32 variable so that it is in range, by
50  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52  * It modifies its argument in-place and also returns it.
53  */
54 static inline UChar32 pinCodePoint(UChar32& c) {
55     if (c < UNICODESET_LOW) {
56         c = UNICODESET_LOW;
57     } else if (c > (UNICODESET_HIGH-1)) {
58         c = (UNICODESET_HIGH-1);
59     }
60     return c;
61 }
62 
63 //----------------------------------------------------------------
64 // Debugging
65 //----------------------------------------------------------------
66 
67 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
68 // To enable the debugging, define the symbol DEBUG_MEM in the line
69 // below.  This will result in text being sent to stdout that looks
70 // like this:
71 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73 // Each line lists a construction (ct) or destruction (dt) event, the
74 // object address, the number of outstanding objects after the event,
75 // and the pattern of the object in question.
76 
77 // #define DEBUG_MEM
78 
79 #ifdef DEBUG_MEM
80 #include <stdio.h>
81 static int32_t _dbgCount = 0;
82 
_dbgct(UnicodeSet * set)83 static inline void _dbgct(UnicodeSet* set) {
84     UnicodeString str;
85     set->toPattern(str, TRUE);
86     char buf[40];
87     str.extract(0, 39, buf, "");
88     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89 }
90 
_dbgdt(UnicodeSet * set)91 static inline void _dbgdt(UnicodeSet* set) {
92     UnicodeString str;
93     set->toPattern(str, TRUE);
94     char buf[40];
95     str.extract(0, 39, buf, "");
96     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97 }
98 
99 #else
100 
101 #define _dbgct(set)
102 #define _dbgdt(set)
103 
104 #endif
105 
106 //----------------------------------------------------------------
107 // UnicodeString in UVector support
108 //----------------------------------------------------------------
109 
cloneUnicodeString(UElement * dst,UElement * src)110 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
111     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
112 }
113 
compareUnicodeString(UElement t1,UElement t2)114 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
116     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
117     return a.compare(b);
118 }
119 
hasStrings() const120 UBool UnicodeSet::hasStrings() const {
121     return strings != nullptr && !strings->isEmpty();
122 }
123 
stringsSize() const124 int32_t UnicodeSet::stringsSize() const {
125     return strings == nullptr ? 0 : strings->size();
126 }
127 
stringsContains(const UnicodeString & s) const128 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129     return strings != nullptr && strings->contains((void*) &s);
130 }
131 
132 //----------------------------------------------------------------
133 // Constructors &c
134 //----------------------------------------------------------------
135 
136 /**
137  * Constructs an empty set.
138  */
UnicodeSet()139 UnicodeSet::UnicodeSet() {
140     list[0] = UNICODESET_HIGH;
141     _dbgct(this);
142 }
143 
144 /**
145  * Constructs a set containing the given range. If <code>end >
146  * start</code> then an empty set is created.
147  *
148  * @param start first character, inclusive, of range
149  * @param end last character, inclusive, of range
150  */
UnicodeSet(UChar32 start,UChar32 end)151 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152     list[0] = UNICODESET_HIGH;
153     add(start, end);
154     _dbgct(this);
155 }
156 
157 /**
158  * Constructs a set that is identical to the given UnicodeSet.
159  */
UnicodeSet(const UnicodeSet & o)160 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
161     *this = o;
162     _dbgct(this);
163 }
164 
165 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)166 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
167     if (ensureCapacity(o.len)) {
168         // *this = o except for bmpSet and stringSpan
169         len = o.len;
170         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
171         if (o.hasStrings()) {
172             UErrorCode status = U_ZERO_ERROR;
173             if (!allocateStrings(status) ||
174                     (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
175                 setToBogus();
176                 return;
177             }
178         }
179         if (o.pat) {
180             setPattern(o.pat, o.patLen);
181         }
182         _dbgct(this);
183     }
184 }
185 
186 /**
187  * Destructs the set.
188  */
~UnicodeSet()189 UnicodeSet::~UnicodeSet() {
190     _dbgdt(this); // first!
191     if (list != stackList) {
192         uprv_free(list);
193     }
194     delete bmpSet;
195     if (buffer != stackList) {
196         uprv_free(buffer);
197     }
198     delete strings;
199     delete stringSpan;
200     releasePattern();
201 }
202 
203 /**
204  * Assigns this object to be a copy of another.
205  */
operator =(const UnicodeSet & o)206 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207     return copyFrom(o, FALSE);
208 }
209 
copyFrom(const UnicodeSet & o,UBool asThawed)210 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211     if (this == &o) {
212         return *this;
213     }
214     if (isFrozen()) {
215         return *this;
216     }
217     if (o.isBogus()) {
218         setToBogus();
219         return *this;
220     }
221     if (!ensureCapacity(o.len)) {
222         // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223         return *this;
224     }
225     len = o.len;
226     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
227     if (o.bmpSet != nullptr && !asThawed) {
228         bmpSet = new BMPSet(*o.bmpSet, list, len);
229         if (bmpSet == NULL) { // Check for memory allocation error.
230             setToBogus();
231             return *this;
232         }
233     }
234     if (o.hasStrings()) {
235         UErrorCode status = U_ZERO_ERROR;
236         if ((strings == nullptr && !allocateStrings(status)) ||
237                 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
238             setToBogus();
239             return *this;
240         }
241     } else if (hasStrings()) {
242         strings->removeAllElements();
243     }
244     if (o.stringSpan != nullptr && !asThawed) {
245         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
246         if (stringSpan == NULL) { // Check for memory allocation error.
247             setToBogus();
248             return *this;
249         }
250     }
251     releasePattern();
252     if (o.pat) {
253         setPattern(o.pat, o.patLen);
254     }
255     return *this;
256 }
257 
258 /**
259  * Returns a copy of this object.  All UnicodeMatcher objects have
260  * to support cloning in order to allow classes using
261  * UnicodeMatchers, such as Transliterator, to implement cloning.
262  */
clone() const263 UnicodeSet* UnicodeSet::clone() const {
264     return new UnicodeSet(*this);
265 }
266 
cloneAsThawed() const267 UnicodeSet *UnicodeSet::cloneAsThawed() const {
268     return new UnicodeSet(*this, TRUE);
269 }
270 
271 /**
272  * Compares the specified object with this set for equality.  Returns
273  * <tt>true</tt> if the two sets
274  * have the same size, and every member of the specified set is
275  * contained in this set (or equivalently, every member of this set is
276  * contained in the specified set).
277  *
278  * @param o set to be compared for equality with this set.
279  * @return <tt>true</tt> if the specified set is equal to this set.
280  */
operator ==(const UnicodeSet & o) const281 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
282     if (len != o.len) return FALSE;
283     for (int32_t i = 0; i < len; ++i) {
284         if (list[i] != o.list[i]) return FALSE;
285     }
286     if (hasStrings() != o.hasStrings()) { return FALSE; }
287     if (hasStrings() && *strings != *o.strings) return FALSE;
288     return TRUE;
289 }
290 
291 /**
292  * Returns the hash code value for this set.
293  *
294  * @return the hash code value for this set.
295  * @see Object#hashCode()
296  */
hashCode(void) const297 int32_t UnicodeSet::hashCode(void) const {
298     uint32_t result = static_cast<uint32_t>(len);
299     for (int32_t i = 0; i < len; ++i) {
300         result *= 1000003u;
301         result += list[i];
302     }
303     return static_cast<int32_t>(result);
304 }
305 
306 //----------------------------------------------------------------
307 // Public API
308 //----------------------------------------------------------------
309 
310 /**
311  * Returns the number of elements in this set (its cardinality),
312  * Note than the elements of a set may include both individual
313  * codepoints and strings.
314  *
315  * @return the number of elements in this set (its cardinality).
316  */
size(void) const317 int32_t UnicodeSet::size(void) const {
318     int32_t n = 0;
319     int32_t count = getRangeCount();
320     for (int32_t i = 0; i < count; ++i) {
321         n += getRangeEnd(i) - getRangeStart(i) + 1;
322     }
323     return n + stringsSize();
324 }
325 
326 /**
327  * Returns <tt>true</tt> if this set contains no elements.
328  *
329  * @return <tt>true</tt> if this set contains no elements.
330  */
isEmpty(void) const331 UBool UnicodeSet::isEmpty(void) const {
332     return len == 1 && !hasStrings();
333 }
334 
335 /**
336  * Returns true if this set contains the given character.
337  * @param c character to be checked for containment
338  * @return true if the test condition is met
339  */
contains(UChar32 c) const340 UBool UnicodeSet::contains(UChar32 c) const {
341     // Set i to the index of the start item greater than ch
342     // We know we will terminate without length test!
343     // LATER: for large sets, add binary search
344     //int32_t i = -1;
345     //for (;;) {
346     //    if (c < list[++i]) break;
347     //}
348     if (bmpSet != NULL) {
349         return bmpSet->contains(c);
350     }
351     if (stringSpan != NULL) {
352         return stringSpan->contains(c);
353     }
354     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
355         return FALSE;
356     }
357     int32_t i = findCodePoint(c);
358     return (UBool)(i & 1); // return true if odd
359 }
360 
361 /**
362  * Returns the smallest value i such that c < list[i].  Caller
363  * must ensure that c is a legal value or this method will enter
364  * an infinite loop.  This method performs a binary search.
365  * @param c a character in the range MIN_VALUE..MAX_VALUE
366  * inclusive
367  * @return the smallest integer i in the range 0..len-1,
368  * inclusive, such that c < list[i]
369  */
findCodePoint(UChar32 c) const370 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371     /* Examples:
372                                        findCodePoint(c)
373        set              list[]         c=0 1 3 4 7 8
374        ===              ==============   ===========
375        []               [110000]         0 0 0 0 0 0
376        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
377        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
378        [:Any:]          [0, 110000]      1 1 1 1 1 1
379      */
380 
381     // Return the smallest i such that c < list[i].  Assume
382     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383     if (c < list[0])
384         return 0;
385     // High runner test.  c is often after the last range, so an
386     // initial check for this condition pays off.
387     int32_t lo = 0;
388     int32_t hi = len - 1;
389     if (lo >= hi || c >= list[hi-1])
390         return hi;
391     // invariant: c >= list[lo]
392     // invariant: c < list[hi]
393     for (;;) {
394         int32_t i = (lo + hi) >> 1;
395         if (i == lo) {
396             break; // Found!
397         } else if (c < list[i]) {
398             hi = i;
399         } else {
400             lo = i;
401         }
402     }
403     return hi;
404 }
405 
406 /**
407  * Returns true if this set contains every character
408  * of the given range.
409  * @param start first character, inclusive, of the range
410  * @param end last character, inclusive, of the range
411  * @return true if the test condition is met
412  */
contains(UChar32 start,UChar32 end) const413 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414     //int32_t i = -1;
415     //for (;;) {
416     //    if (start < list[++i]) break;
417     //}
418     int32_t i = findCodePoint(start);
419     return ((i & 1) != 0 && end < list[i]);
420 }
421 
422 /**
423  * Returns <tt>true</tt> if this set contains the given
424  * multicharacter string.
425  * @param s string to be checked for containment
426  * @return <tt>true</tt> if this set contains the specified string
427  */
contains(const UnicodeString & s) const428 UBool UnicodeSet::contains(const UnicodeString& s) const {
429     int32_t cp = getSingleCP(s);
430     if (cp < 0) {
431         return stringsContains(s);
432     } else {
433         return contains((UChar32) cp);
434     }
435 }
436 
437 /**
438  * Returns true if this set contains all the characters and strings
439  * of the given set.
440  * @param c set to be checked for containment
441  * @return true if the test condition is met
442  */
containsAll(const UnicodeSet & c) const443 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444     // The specified set is a subset if all of its pairs are contained in
445     // this set.  It's possible to code this more efficiently in terms of
446     // direct manipulation of the inversion lists if the need arises.
447     int32_t n = c.getRangeCount();
448     for (int i=0; i<n; ++i) {
449         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450             return FALSE;
451         }
452     }
453     return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
454 }
455 
456 /**
457  * Returns true if this set contains all the characters
458  * of the given string.
459  * @param s string containing characters to be checked for containment
460  * @return true if the test condition is met
461  */
containsAll(const UnicodeString & s) const462 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
464                    s.length());
465 }
466 
467 /**
468  * Returns true if this set contains none of the characters
469  * of the given range.
470  * @param start first character, inclusive, of the range
471  * @param end last character, inclusive, of the range
472  * @return true if the test condition is met
473  */
containsNone(UChar32 start,UChar32 end) const474 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
475     //int32_t i = -1;
476     //for (;;) {
477     //    if (start < list[++i]) break;
478     //}
479     int32_t i = findCodePoint(start);
480     return ((i & 1) == 0 && end < list[i]);
481 }
482 
483 /**
484  * Returns true if this set contains none of the characters and strings
485  * of the given set.
486  * @param c set to be checked for containment
487  * @return true if the test condition is met
488  */
containsNone(const UnicodeSet & c) const489 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
490     // The specified set is a subset if all of its pairs are contained in
491     // this set.  It's possible to code this more efficiently in terms of
492     // direct manipulation of the inversion lists if the need arises.
493     int32_t n = c.getRangeCount();
494     for (int32_t i=0; i<n; ++i) {
495         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
496             return FALSE;
497         }
498     }
499     return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
500 }
501 
502 /**
503  * Returns true if this set contains none of the characters
504  * of the given string.
505  * @param s string containing characters to be checked for containment
506  * @return true if the test condition is met
507  */
containsNone(const UnicodeString & s) const508 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
509     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
510                    s.length());
511 }
512 
513 /**
514  * Returns <tt>true</tt> if this set contains any character whose low byte
515  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
516  * indexing.
517  */
matchesIndexValue(uint8_t v) const518 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
519     /* The index value v, in the range [0,255], is contained in this set if
520      * it is contained in any pair of this set.  Pairs either have the high
521      * bytes equal, or unequal.  If the high bytes are equal, then we have
522      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
523      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
524      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
525      * time zone month containment logic.)
526      */
527     int32_t i;
528     int32_t rangeCount=getRangeCount();
529     for (i=0; i<rangeCount; ++i) {
530         UChar32 low = getRangeStart(i);
531         UChar32 high = getRangeEnd(i);
532         if ((low & ~0xFF) == (high & ~0xFF)) {
533             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
534                 return TRUE;
535             }
536         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
537             return TRUE;
538         }
539     }
540     if (hasStrings()) {
541         for (i=0; i<strings->size(); ++i) {
542             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
543             if (s.isEmpty()) {
544                 continue;  // skip the empty string
545             }
546             UChar32 c = s.char32At(0);
547             if ((c & 0xFF) == v) {
548                 return TRUE;
549             }
550         }
551     }
552     return FALSE;
553 }
554 
555 /**
556  * Implementation of UnicodeMatcher::matches().  Always matches the
557  * longest possible multichar string.
558  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)559 UMatchDegree UnicodeSet::matches(const Replaceable& text,
560                                  int32_t& offset,
561                                  int32_t limit,
562                                  UBool incremental) {
563     if (offset == limit) {
564         if (contains(U_ETHER)) {
565             return incremental ? U_PARTIAL_MATCH : U_MATCH;
566         } else {
567             return U_MISMATCH;
568         }
569     } else {
570         if (hasStrings()) { // try strings first
571 
572             // might separate forward and backward loops later
573             // for now they are combined
574 
575             // TODO Improve efficiency of this, at least in the forward
576             // direction, if not in both.  In the forward direction we
577             // can assume the strings are sorted.
578 
579             int32_t i;
580             UBool forward = offset < limit;
581 
582             // firstChar is the leftmost char to match in the
583             // forward direction or the rightmost char to match in
584             // the reverse direction.
585             UChar firstChar = text.charAt(offset);
586 
587             // If there are multiple strings that can match we
588             // return the longest match.
589             int32_t highWaterLength = 0;
590 
591             for (i=0; i<strings->size(); ++i) {
592                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
593                 if (trial.isEmpty()) {
594                     continue;  // skip the empty string
595                 }
596 
597                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
598 
599                 // Strings are sorted, so we can optimize in the
600                 // forward direction.
601                 if (forward && c > firstChar) break;
602                 if (c != firstChar) continue;
603 
604                 int32_t matchLen = matchRest(text, offset, limit, trial);
605 
606                 if (incremental) {
607                     int32_t maxLen = forward ? limit-offset : offset-limit;
608                     if (matchLen == maxLen) {
609                         // We have successfully matched but only up to limit.
610                         return U_PARTIAL_MATCH;
611                     }
612                 }
613 
614                 if (matchLen == trial.length()) {
615                     // We have successfully matched the whole string.
616                     if (matchLen > highWaterLength) {
617                         highWaterLength = matchLen;
618                     }
619                     // In the forward direction we know strings
620                     // are sorted so we can bail early.
621                     if (forward && matchLen < highWaterLength) {
622                         break;
623                     }
624                     continue;
625                 }
626             }
627 
628             // We've checked all strings without a partial match.
629             // If we have full matches, return the longest one.
630             if (highWaterLength != 0) {
631                 offset += forward ? highWaterLength : -highWaterLength;
632                 return U_MATCH;
633             }
634         }
635         return UnicodeFilter::matches(text, offset, limit, incremental);
636     }
637 }
638 
639 /**
640  * Returns the longest match for s in text at the given position.
641  * If limit > start then match forward from start+1 to limit
642  * matching all characters except s.charAt(0).  If limit < start,
643  * go backward starting from start-1 matching all characters
644  * except s.charAt(s.length()-1).  This method assumes that the
645  * first character, text.charAt(start), matches s, so it does not
646  * check it.
647  * @param text the text to match
648  * @param start the first character to match.  In the forward
649  * direction, text.charAt(start) is matched against s.charAt(0).
650  * In the reverse direction, it is matched against
651  * s.charAt(s.length()-1).
652  * @param limit the limit offset for matching, either last+1 in
653  * the forward direction, or last-1 in the reverse direction,
654  * where last is the index of the last character to match.
655  * @return If part of s matches up to the limit, return |limit -
656  * start|.  If all of s matches before reaching the limit, return
657  * s.length().  If there is a mismatch between s and text, return
658  * 0
659  */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)660 int32_t UnicodeSet::matchRest(const Replaceable& text,
661                               int32_t start, int32_t limit,
662                               const UnicodeString& s) {
663     int32_t i;
664     int32_t maxLen;
665     int32_t slen = s.length();
666     if (start < limit) {
667         maxLen = limit - start;
668         if (maxLen > slen) maxLen = slen;
669         for (i = 1; i < maxLen; ++i) {
670             if (text.charAt(start + i) != s.charAt(i)) return 0;
671         }
672     } else {
673         maxLen = start - limit;
674         if (maxLen > slen) maxLen = slen;
675         --slen; // <=> slen = s.length() - 1;
676         for (i = 1; i < maxLen; ++i) {
677             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
678         }
679     }
680     return maxLen;
681 }
682 
683 /**
684  * Implement of UnicodeMatcher
685  */
addMatchSetTo(UnicodeSet & toUnionTo) const686 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
687     toUnionTo.addAll(*this);
688 }
689 
690 /**
691  * Returns the index of the given character within this set, where
692  * the set is ordered by ascending code point.  If the character
693  * is not in this set, return -1.  The inverse of this method is
694  * <code>charAt()</code>.
695  * @return an index from 0..size()-1, or -1
696  */
indexOf(UChar32 c) const697 int32_t UnicodeSet::indexOf(UChar32 c) const {
698     if (c < MIN_VALUE || c > MAX_VALUE) {
699         return -1;
700     }
701     int32_t i = 0;
702     int32_t n = 0;
703     for (;;) {
704         UChar32 start = list[i++];
705         if (c < start) {
706             return -1;
707         }
708         UChar32 limit = list[i++];
709         if (c < limit) {
710             return n + c - start;
711         }
712         n += limit - start;
713     }
714 }
715 
716 /**
717  * Returns the character at the given index within this set, where
718  * the set is ordered by ascending code point.  If the index is
719  * out of range, return (UChar32)-1.  The inverse of this method is
720  * <code>indexOf()</code>.
721  * @param index an index from 0..size()-1
722  * @return the character at the given index, or (UChar32)-1.
723  */
charAt(int32_t index) const724 UChar32 UnicodeSet::charAt(int32_t index) const {
725     if (index >= 0) {
726         // len2 is the largest even integer <= len, that is, it is len
727         // for even values and len-1 for odd values.  With odd values
728         // the last entry is UNICODESET_HIGH.
729         int32_t len2 = len & ~1;
730         for (int32_t i=0; i < len2;) {
731             UChar32 start = list[i++];
732             int32_t count = list[i++] - start;
733             if (index < count) {
734                 return (UChar32)(start + index);
735             }
736             index -= count;
737         }
738     }
739     return (UChar32)-1;
740 }
741 
742 /**
743  * Make this object represent the range <code>start - end</code>.
744  * If <code>end > start</code> then this object is set to an
745  * an empty range.
746  *
747  * @param start first character in the set, inclusive
748  * @rparam end last character in the set, inclusive
749  */
set(UChar32 start,UChar32 end)750 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
751     clear();
752     complement(start, end);
753     return *this;
754 }
755 
756 /**
757  * Adds the specified range to this set if it is not already
758  * present.  If this set already contains the specified range,
759  * the call leaves this set unchanged.  If <code>end > start</code>
760  * then an empty range is added, leaving the set unchanged.
761  *
762  * @param start first character, inclusive, of range to be added
763  * to this set.
764  * @param end last character, inclusive, of range to be added
765  * to this set.
766  */
add(UChar32 start,UChar32 end)767 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
768     if (pinCodePoint(start) < pinCodePoint(end)) {
769         UChar32 limit = end + 1;
770         // Fast path for adding a new range after the last one.
771         // Odd list length: [..., lastStart, lastLimit, HIGH]
772         if ((len & 1) != 0) {
773             // If the list is empty, set lastLimit low enough to not be adjacent to 0.
774             UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
775             if (lastLimit <= start && !isFrozen() && !isBogus()) {
776                 if (lastLimit == start) {
777                     // Extend the last range.
778                     list[len - 2] = limit;
779                     if (limit == UNICODESET_HIGH) {
780                         --len;
781                     }
782                 } else {
783                     list[len - 1] = start;
784                     if (limit < UNICODESET_HIGH) {
785                         if (ensureCapacity(len + 2)) {
786                             list[len++] = limit;
787                             list[len++] = UNICODESET_HIGH;
788                         }
789                     } else {  // limit == UNICODESET_HIGH
790                         if (ensureCapacity(len + 1)) {
791                             list[len++] = UNICODESET_HIGH;
792                         }
793                     }
794                 }
795                 releasePattern();
796                 return *this;
797             }
798         }
799         // This is slow. Could be much faster using findCodePoint(start)
800         // and modifying the list, dealing with adjacent & overlapping ranges.
801         UChar32 range[3] = { start, limit, UNICODESET_HIGH };
802         add(range, 2, 0);
803     } else if (start == end) {
804         add(start);
805     }
806     return *this;
807 }
808 
809 // #define DEBUG_US_ADD
810 
811 #ifdef DEBUG_US_ADD
812 #include <stdio.h>
dump(UChar32 c)813 void dump(UChar32 c) {
814     if (c <= 0xFF) {
815         printf("%c", (char)c);
816     } else {
817         printf("U+%04X", c);
818     }
819 }
dump(const UChar32 * list,int32_t len)820 void dump(const UChar32* list, int32_t len) {
821     printf("[");
822     for (int32_t i=0; i<len; ++i) {
823         if (i != 0) printf(", ");
824         dump(list[i]);
825     }
826     printf("]");
827 }
828 #endif
829 
830 /**
831  * Adds the specified character to this set if it is not already
832  * present.  If this set already contains the specified character,
833  * the call leaves this set unchanged.
834  */
add(UChar32 c)835 UnicodeSet& UnicodeSet::add(UChar32 c) {
836     // find smallest i such that c < list[i]
837     // if odd, then it is IN the set
838     // if even, then it is OUT of the set
839     int32_t i = findCodePoint(pinCodePoint(c));
840 
841     // already in set?
842     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
843 
844     // HIGH is 0x110000
845     // assert(list[len-1] == HIGH);
846 
847     // empty = [HIGH]
848     // [start_0, limit_0, start_1, limit_1, HIGH]
849 
850     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
851     //                             ^
852     //                             list[i]
853 
854     // i == 0 means c is before the first range
855 
856 #ifdef DEBUG_US_ADD
857     printf("Add of ");
858     dump(c);
859     printf(" found at %d", i);
860     printf(": ");
861     dump(list, len);
862     printf(" => ");
863 #endif
864 
865     if (c == list[i]-1) {
866         // c is before start of next range
867         list[i] = c;
868         // if we touched the HIGH mark, then add a new one
869         if (c == (UNICODESET_HIGH - 1)) {
870             if (!ensureCapacity(len+1)) {
871                 // ensureCapacity will mark the object as Bogus if OOM failure happens.
872                 return *this;
873             }
874             list[len++] = UNICODESET_HIGH;
875         }
876         if (i > 0 && c == list[i-1]) {
877             // collapse adjacent ranges
878 
879             // [..., start_k-1, c, c, limit_k, ..., HIGH]
880             //                     ^
881             //                     list[i]
882 
883             //for (int32_t k=i-1; k<len-2; ++k) {
884             //    list[k] = list[k+2];
885             //}
886             UChar32* dst = list + i - 1;
887             UChar32* src = dst + 2;
888             UChar32* srclimit = list + len;
889             while (src < srclimit) *(dst++) = *(src++);
890 
891             len -= 2;
892         }
893     }
894 
895     else if (i > 0 && c == list[i-1]) {
896         // c is after end of prior range
897         list[i-1]++;
898         // no need to check for collapse here
899     }
900 
901     else {
902         // At this point we know the new char is not adjacent to
903         // any existing ranges, and it is not 10FFFF.
904 
905 
906         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
907         //                             ^
908         //                             list[i]
909 
910         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
911         //                             ^
912         //                             list[i]
913 
914         if (!ensureCapacity(len+2)) {
915             // ensureCapacity will mark the object as Bogus if OOM failure happens.
916             return *this;
917         }
918 
919         UChar32 *p = list + i;
920         uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
921         list[i] = c;
922         list[i+1] = c+1;
923         len += 2;
924     }
925 
926 #ifdef DEBUG_US_ADD
927     dump(list, len);
928     printf("\n");
929 
930     for (i=1; i<len; ++i) {
931         if (list[i] <= list[i-1]) {
932             // Corrupt array!
933             printf("ERROR: list has been corrupted\n");
934             exit(1);
935         }
936     }
937 #endif
938 
939     releasePattern();
940     return *this;
941 }
942 
943 /**
944  * Adds the specified multicharacter to this set if it is not already
945  * present.  If this set already contains the multicharacter,
946  * the call leaves this set unchanged.
947  * Thus "ch" => {"ch"}
948  *
949  * @param s the source string
950  * @return the modified set, for chaining
951  */
add(const UnicodeString & s)952 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
953     if (isFrozen() || isBogus()) return *this;
954     int32_t cp = getSingleCP(s);
955     if (cp < 0) {
956         if (!stringsContains(s)) {
957             _add(s);
958             releasePattern();
959         }
960     } else {
961         add((UChar32)cp);
962     }
963     return *this;
964 }
965 
966 /**
967  * Adds the given string, in order, to 'strings'.  The given string
968  * must have been checked by the caller to not already be in 'strings'.
969  */
_add(const UnicodeString & s)970 void UnicodeSet::_add(const UnicodeString& s) {
971     if (isFrozen() || isBogus()) {
972         return;
973     }
974     UErrorCode ec = U_ZERO_ERROR;
975     if (strings == nullptr && !allocateStrings(ec)) {
976         setToBogus();
977         return;
978     }
979     UnicodeString* t = new UnicodeString(s);
980     if (t == NULL) { // Check for memory allocation error.
981         setToBogus();
982         return;
983     }
984     strings->sortedInsert(t, compareUnicodeString, ec);
985     if (U_FAILURE(ec)) {
986         setToBogus();
987         delete t;
988     }
989 }
990 
991 /**
992  * @return a code point IF the string consists of a single one.
993  * otherwise returns -1.
994  * @param string to test
995  */
getSingleCP(const UnicodeString & s)996 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
997     int32_t sLength = s.length();
998     if (sLength == 1) return s.charAt(0);
999     if (sLength == 2) {
1000         UChar32 cp = s.char32At(0);
1001         if (cp > 0xFFFF) { // is surrogate pair
1002             return cp;
1003         }
1004     }
1005     return -1;
1006 }
1007 
1008 /**
1009  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1010  * If this set already any particular character, it has no effect on that character.
1011  * @param the source string
1012  * @return the modified set, for chaining
1013  */
addAll(const UnicodeString & s)1014 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1015     UChar32 cp;
1016     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1017         cp = s.char32At(i);
1018         add(cp);
1019     }
1020     return *this;
1021 }
1022 
1023 /**
1024  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1025  * If this set already any particular character, it has no effect on that character.
1026  * @param the source string
1027  * @return the modified set, for chaining
1028  */
retainAll(const UnicodeString & s)1029 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1030     UnicodeSet set;
1031     set.addAll(s);
1032     retainAll(set);
1033     return *this;
1034 }
1035 
1036 /**
1037  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1038  * If this set already any particular character, it has no effect on that character.
1039  * @param the source string
1040  * @return the modified set, for chaining
1041  */
complementAll(const UnicodeString & s)1042 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1043     UnicodeSet set;
1044     set.addAll(s);
1045     complementAll(set);
1046     return *this;
1047 }
1048 
1049 /**
1050  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1051  * If this set already any particular character, it has no effect on that character.
1052  * @param the source string
1053  * @return the modified set, for chaining
1054  */
removeAll(const UnicodeString & s)1055 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1056     UnicodeSet set;
1057     set.addAll(s);
1058     removeAll(set);
1059     return *this;
1060 }
1061 
removeAllStrings()1062 UnicodeSet& UnicodeSet::removeAllStrings() {
1063     if (!isFrozen() && hasStrings()) {
1064         strings->removeAllElements();
1065         releasePattern();
1066     }
1067     return *this;
1068 }
1069 
1070 
1071 /**
1072  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1073  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1074  * @param the source string
1075  * @return a newly created set containing the given string
1076  */
createFrom(const UnicodeString & s)1077 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1078     UnicodeSet *set = new UnicodeSet();
1079     if (set != NULL) { // Check for memory allocation error.
1080         set->add(s);
1081     }
1082     return set;
1083 }
1084 
1085 
1086 /**
1087  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1088  * @param the source string
1089  * @return a newly created set containing the given characters
1090  */
createFromAll(const UnicodeString & s)1091 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1092     UnicodeSet *set = new UnicodeSet();
1093     if (set != NULL) { // Check for memory allocation error.
1094         set->addAll(s);
1095     }
1096     return set;
1097 }
1098 
1099 /**
1100  * Retain only the elements in this set that are contained in the
1101  * specified range.  If <code>end > start</code> then an empty range is
1102  * retained, leaving the set empty.
1103  *
1104  * @param start first character, inclusive, of range to be retained
1105  * to this set.
1106  * @param end last character, inclusive, of range to be retained
1107  * to this set.
1108  */
retain(UChar32 start,UChar32 end)1109 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1110     if (pinCodePoint(start) <= pinCodePoint(end)) {
1111         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1112         retain(range, 2, 0);
1113     } else {
1114         clear();
1115     }
1116     return *this;
1117 }
1118 
retain(UChar32 c)1119 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1120     return retain(c, c);
1121 }
1122 
retain(const UnicodeString & s)1123 UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1124     if (isFrozen() || isBogus()) { return *this; }
1125     UChar32 cp = getSingleCP(s);
1126     if (cp < 0) {
1127         bool isIn = stringsContains(s);
1128         // Check for getRangeCount() first to avoid somewhat-expensive size()
1129         // when there are single code points.
1130         if (isIn && getRangeCount() == 0 && size() == 1) {
1131             return *this;
1132         }
1133         clear();
1134         if (isIn) {
1135             _add(s);
1136         }
1137     } else {
1138         retain(cp, cp);
1139     }
1140     return *this;
1141 }
1142 
1143 /**
1144  * Removes the specified range from this set if it is present.
1145  * The set will not contain the specified range once the call
1146  * returns.  If <code>end > start</code> then an empty range is
1147  * removed, leaving the set unchanged.
1148  *
1149  * @param start first character, inclusive, of range to be removed
1150  * from this set.
1151  * @param end last character, inclusive, of range to be removed
1152  * from this set.
1153  */
remove(UChar32 start,UChar32 end)1154 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1155     if (pinCodePoint(start) <= pinCodePoint(end)) {
1156         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1157         retain(range, 2, 2);
1158     }
1159     return *this;
1160 }
1161 
1162 /**
1163  * Removes the specified character from this set if it is present.
1164  * The set will not contain the specified range once the call
1165  * returns.
1166  */
remove(UChar32 c)1167 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1168     return remove(c, c);
1169 }
1170 
1171 /**
1172  * Removes the specified string from this set if it is present.
1173  * The set will not contain the specified character once the call
1174  * returns.
1175  * @param the source string
1176  * @return the modified set, for chaining
1177  */
remove(const UnicodeString & s)1178 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1179     if (isFrozen() || isBogus()) return *this;
1180     int32_t cp = getSingleCP(s);
1181     if (cp < 0) {
1182         if (strings != nullptr && strings->removeElement((void*) &s)) {
1183             releasePattern();
1184         }
1185     } else {
1186         remove((UChar32)cp, (UChar32)cp);
1187     }
1188     return *this;
1189 }
1190 
1191 /**
1192  * Complements the specified range in this set.  Any character in
1193  * the range will be removed if it is in this set, or will be
1194  * added if it is not in this set.  If <code>end > start</code>
1195  * then an empty range is xor'ed, leaving the set unchanged.
1196  *
1197  * @param start first character, inclusive, of range to be removed
1198  * from this set.
1199  * @param end last character, inclusive, of range to be removed
1200  * from this set.
1201  */
complement(UChar32 start,UChar32 end)1202 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1203     if (isFrozen() || isBogus()) {
1204         return *this;
1205     }
1206     if (pinCodePoint(start) <= pinCodePoint(end)) {
1207         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1208         exclusiveOr(range, 2, 0);
1209     }
1210     releasePattern();
1211     return *this;
1212 }
1213 
complement(UChar32 c)1214 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1215     return complement(c, c);
1216 }
1217 
1218 /**
1219  * This is equivalent to
1220  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1221  */
complement(void)1222 UnicodeSet& UnicodeSet::complement(void) {
1223     if (isFrozen() || isBogus()) {
1224         return *this;
1225     }
1226     if (list[0] == UNICODESET_LOW) {
1227         uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1228         --len;
1229     } else {
1230         if (!ensureCapacity(len+1)) {
1231             return *this;
1232         }
1233         uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1234         list[0] = UNICODESET_LOW;
1235         ++len;
1236     }
1237     releasePattern();
1238     return *this;
1239 }
1240 
1241 /**
1242  * Complement the specified string in this set.
1243  * The set will not contain the specified string once the call
1244  * returns.
1245  *
1246  * @param s the string to complement
1247  * @return this object, for chaining
1248  */
complement(const UnicodeString & s)1249 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1250     if (isFrozen() || isBogus()) return *this;
1251     int32_t cp = getSingleCP(s);
1252     if (cp < 0) {
1253         if (stringsContains(s)) {
1254             strings->removeElement((void*) &s);
1255         } else {
1256             _add(s);
1257         }
1258         releasePattern();
1259     } else {
1260         complement((UChar32)cp, (UChar32)cp);
1261     }
1262     return *this;
1263 }
1264 
1265 /**
1266  * Adds all of the elements in the specified set to this set if
1267  * they're not already present.  This operation effectively
1268  * modifies this set so that its value is the <i>union</i> of the two
1269  * sets.  The behavior of this operation is unspecified if the specified
1270  * collection is modified while the operation is in progress.
1271  *
1272  * @param c set whose elements are to be added to this set.
1273  * @see #add(char, char)
1274  */
addAll(const UnicodeSet & c)1275 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1276     if ( c.len>0 && c.list!=NULL ) {
1277         add(c.list, c.len, 0);
1278     }
1279 
1280     // Add strings in order
1281     if ( c.strings!=NULL ) {
1282         for (int32_t i=0; i<c.strings->size(); ++i) {
1283             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1284             if (!stringsContains(*s)) {
1285                 _add(*s);
1286             }
1287         }
1288     }
1289     return *this;
1290 }
1291 
1292 /**
1293  * Retains only the elements in this set that are contained in the
1294  * specified set.  In other words, removes from this set all of
1295  * its elements that are not contained in the specified set.  This
1296  * operation effectively modifies this set so that its value is
1297  * the <i>intersection</i> of the two sets.
1298  *
1299  * @param c set that defines which elements this set will retain.
1300  */
retainAll(const UnicodeSet & c)1301 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1302     if (isFrozen() || isBogus()) {
1303         return *this;
1304     }
1305     retain(c.list, c.len, 0);
1306     if (hasStrings()) {
1307         if (!c.hasStrings()) {
1308             strings->removeAllElements();
1309         } else {
1310             strings->retainAll(*c.strings);
1311         }
1312     }
1313     return *this;
1314 }
1315 
1316 /**
1317  * Removes from this set all of its elements that are contained in the
1318  * specified set.  This operation effectively modifies this
1319  * set so that its value is the <i>asymmetric set difference</i> of
1320  * the two sets.
1321  *
1322  * @param c set that defines which elements will be removed from
1323  *          this set.
1324  */
removeAll(const UnicodeSet & c)1325 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1326     if (isFrozen() || isBogus()) {
1327         return *this;
1328     }
1329     retain(c.list, c.len, 2);
1330     if (hasStrings() && c.hasStrings()) {
1331         strings->removeAll(*c.strings);
1332     }
1333     return *this;
1334 }
1335 
1336 /**
1337  * Complements in this set all elements contained in the specified
1338  * set.  Any character in the other set will be removed if it is
1339  * in this set, or will be added if it is not in this set.
1340  *
1341  * @param c set that defines which elements will be xor'ed from
1342  *          this set.
1343  */
complementAll(const UnicodeSet & c)1344 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1345     if (isFrozen() || isBogus()) {
1346         return *this;
1347     }
1348     exclusiveOr(c.list, c.len, 0);
1349 
1350     if (c.strings != nullptr) {
1351         for (int32_t i=0; i<c.strings->size(); ++i) {
1352             void* e = c.strings->elementAt(i);
1353             if (strings == nullptr || !strings->removeElement(e)) {
1354                 _add(*(const UnicodeString*)e);
1355             }
1356         }
1357     }
1358     return *this;
1359 }
1360 
1361 /**
1362  * Removes all of the elements from this set.  This set will be
1363  * empty after this call returns.
1364  */
clear(void)1365 UnicodeSet& UnicodeSet::clear(void) {
1366     if (isFrozen()) {
1367         return *this;
1368     }
1369     list[0] = UNICODESET_HIGH;
1370     len = 1;
1371     releasePattern();
1372     if (strings != NULL) {
1373         strings->removeAllElements();
1374     }
1375     // Remove bogus
1376     fFlags = 0;
1377     return *this;
1378 }
1379 
1380 /**
1381  * Iteration method that returns the number of ranges contained in
1382  * this set.
1383  * @see #getRangeStart
1384  * @see #getRangeEnd
1385  */
getRangeCount() const1386 int32_t UnicodeSet::getRangeCount() const {
1387     return len/2;
1388 }
1389 
1390 /**
1391  * Iteration method that returns the first character in the
1392  * specified range of this set.
1393  * @see #getRangeCount
1394  * @see #getRangeEnd
1395  */
getRangeStart(int32_t index) const1396 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1397     return list[index*2];
1398 }
1399 
1400 /**
1401  * Iteration method that returns the last character in the
1402  * specified range of this set.
1403  * @see #getRangeStart
1404  * @see #getRangeEnd
1405  */
getRangeEnd(int32_t index) const1406 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1407     return list[index*2 + 1] - 1;
1408 }
1409 
getString(int32_t index) const1410 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1411     return (const UnicodeString*) strings->elementAt(index);
1412 }
1413 
1414 /**
1415  * Reallocate this objects internal structures to take up the least
1416  * possible space, without changing this object's value.
1417  */
compact()1418 UnicodeSet& UnicodeSet::compact() {
1419     if (isFrozen() || isBogus()) {
1420         return *this;
1421     }
1422     // Delete buffer first to defragment memory less.
1423     if (buffer != stackList) {
1424         uprv_free(buffer);
1425         buffer = NULL;
1426         bufferCapacity = 0;
1427     }
1428     if (list == stackList) {
1429         // pass
1430     } else if (len <= INITIAL_CAPACITY) {
1431         uprv_memcpy(stackList, list, len * sizeof(UChar32));
1432         uprv_free(list);
1433         list = stackList;
1434         capacity = INITIAL_CAPACITY;
1435     } else if ((len + 7) < capacity) {
1436         // If we have more than a little unused capacity, shrink it to len.
1437         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1438         if (temp) {
1439             list = temp;
1440             capacity = len;
1441         }
1442         // else what the heck happened?! We allocated less memory!
1443         // Oh well. We'll keep our original array.
1444     }
1445     if (strings != nullptr && strings->isEmpty()) {
1446         delete strings;
1447         strings = nullptr;
1448     }
1449     return *this;
1450 }
1451 
1452 #ifdef DEBUG_SERIALIZE
1453 #include <stdio.h>
1454 #endif
1455 
1456 /**
1457  * Deserialize constructor.
1458  */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1459 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1460                        UErrorCode &ec) {
1461 
1462   if(U_FAILURE(ec)) {
1463     setToBogus();
1464     return;
1465   }
1466 
1467   if( (serialization != kSerialized)
1468       || (data==NULL)
1469       || (dataLen < 1)) {
1470     ec = U_ILLEGAL_ARGUMENT_ERROR;
1471     setToBogus();
1472     return;
1473   }
1474 
1475   // bmp?
1476   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1477   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1478 
1479   int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1480 #ifdef DEBUG_SERIALIZE
1481   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1482 #endif
1483   if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
1484     return;
1485   }
1486   // copy bmp
1487   int32_t i;
1488   for(i = 0; i< bmpLength;i++) {
1489     list[i] = data[i+headerSize];
1490 #ifdef DEBUG_SERIALIZE
1491     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1492 #endif
1493   }
1494   // copy smp
1495   for(i=bmpLength;i<newLength;i++) {
1496     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1497               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1498 #ifdef DEBUG_SERIALIZE
1499     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1500 #endif
1501   }
1502   U_ASSERT(i == newLength);
1503   if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1504     list[i++] = UNICODESET_HIGH;
1505   }
1506   len = i;
1507 }
1508 
1509 
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1510 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1511     int32_t bmpLength, length, destLength;
1512 
1513     if (U_FAILURE(ec)) {
1514         return 0;
1515     }
1516 
1517     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1518         ec=U_ILLEGAL_ARGUMENT_ERROR;
1519         return 0;
1520     }
1521 
1522     /* count necessary 16-bit units */
1523     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1524     // assert(length>=0);
1525     if (length==0) {
1526         /* empty set */
1527         if (destCapacity>0) {
1528             *dest=0;
1529         } else {
1530             ec=U_BUFFER_OVERFLOW_ERROR;
1531         }
1532         return 1;
1533     }
1534     /* now length>0 */
1535 
1536     if (this->list[length-1]<=0xffff) {
1537         /* all BMP */
1538         bmpLength=length;
1539     } else if (this->list[0]>=0x10000) {
1540         /* all supplementary */
1541         bmpLength=0;
1542         length*=2;
1543     } else {
1544         /* some BMP, some supplementary */
1545         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1546         length=bmpLength+2*(length-bmpLength);
1547     }
1548 #ifdef DEBUG_SERIALIZE
1549     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1550 #endif
1551     /* length: number of 16-bit array units */
1552     if (length>0x7fff) {
1553         /* there are only 15 bits for the length in the first serialized word */
1554         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1555         return 0;
1556     }
1557 
1558     /*
1559      * total serialized length:
1560      * number of 16-bit array units (length) +
1561      * 1 length unit (always) +
1562      * 1 bmpLength unit (if there are supplementary values)
1563      */
1564     destLength=length+((length>bmpLength)?2:1);
1565     if (destLength<=destCapacity) {
1566         const UChar32 *p;
1567         int32_t i;
1568 
1569 #ifdef DEBUG_SERIALIZE
1570         printf("writeHdr\n");
1571 #endif
1572         *dest=(uint16_t)length;
1573         if (length>bmpLength) {
1574             *dest|=0x8000;
1575             *++dest=(uint16_t)bmpLength;
1576         }
1577         ++dest;
1578 
1579         /* write the BMP part of the array */
1580         p=this->list;
1581         for (i=0; i<bmpLength; ++i) {
1582 #ifdef DEBUG_SERIALIZE
1583           printf("writebmp: %x\n", (int)*p);
1584 #endif
1585             *dest++=(uint16_t)*p++;
1586         }
1587 
1588         /* write the supplementary part of the array */
1589         for (; i<length; i+=2) {
1590 #ifdef DEBUG_SERIALIZE
1591           printf("write32: %x\n", (int)*p);
1592 #endif
1593             *dest++=(uint16_t)(*p>>16);
1594             *dest++=(uint16_t)*p++;
1595         }
1596     } else {
1597         ec=U_BUFFER_OVERFLOW_ERROR;
1598     }
1599     return destLength;
1600 }
1601 
1602 //----------------------------------------------------------------
1603 // Implementation: Utility methods
1604 //----------------------------------------------------------------
1605 
1606 /**
1607  * Allocate our strings vector and return TRUE if successful.
1608  */
allocateStrings(UErrorCode & status)1609 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1610     if (U_FAILURE(status)) {
1611         return FALSE;
1612     }
1613     strings = new UVector(uprv_deleteUObject,
1614                           uhash_compareUnicodeString, 1, status);
1615     if (strings == NULL) { // Check for memory allocation error.
1616         status = U_MEMORY_ALLOCATION_ERROR;
1617         return FALSE;
1618     }
1619     if (U_FAILURE(status)) {
1620         delete strings;
1621         strings = NULL;
1622         return FALSE;
1623     }
1624     return TRUE;
1625 }
1626 
nextCapacity(int32_t minCapacity)1627 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1628     // Grow exponentially to reduce the frequency of allocations.
1629     if (minCapacity < INITIAL_CAPACITY) {
1630         return minCapacity + INITIAL_CAPACITY;
1631     } else if (minCapacity <= 2500) {
1632         return 5 * minCapacity;
1633     } else {
1634         int32_t newCapacity = 2 * minCapacity;
1635         if (newCapacity > MAX_LENGTH) {
1636             newCapacity = MAX_LENGTH;
1637         }
1638         return newCapacity;
1639     }
1640 }
1641 
ensureCapacity(int32_t newLen)1642 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1643     if (newLen > MAX_LENGTH) {
1644         newLen = MAX_LENGTH;
1645     }
1646     if (newLen <= capacity) {
1647         return true;
1648     }
1649     int32_t newCapacity = nextCapacity(newLen);
1650     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1651     if (temp == NULL) {
1652         setToBogus(); // set the object to bogus state if an OOM failure occurred.
1653         return false;
1654     }
1655     // Copy only the actual contents.
1656     uprv_memcpy(temp, list, len * sizeof(UChar32));
1657     if (list != stackList) {
1658         uprv_free(list);
1659     }
1660     list = temp;
1661     capacity = newCapacity;
1662     return true;
1663 }
1664 
ensureBufferCapacity(int32_t newLen)1665 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1666     if (newLen > MAX_LENGTH) {
1667         newLen = MAX_LENGTH;
1668     }
1669     if (newLen <= bufferCapacity) {
1670         return true;
1671     }
1672     int32_t newCapacity = nextCapacity(newLen);
1673     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1674     if (temp == NULL) {
1675         setToBogus();
1676         return false;
1677     }
1678     // The buffer has no contents to be copied.
1679     // It is always filled from scratch after this call.
1680     if (buffer != stackList) {
1681         uprv_free(buffer);
1682     }
1683     buffer = temp;
1684     bufferCapacity = newCapacity;
1685     return true;
1686 }
1687 
1688 /**
1689  * Swap list and buffer.
1690  */
swapBuffers(void)1691 void UnicodeSet::swapBuffers(void) {
1692     // swap list and buffer
1693     UChar32* temp = list;
1694     list = buffer;
1695     buffer = temp;
1696 
1697     int32_t c = capacity;
1698     capacity = bufferCapacity;
1699     bufferCapacity = c;
1700 }
1701 
setToBogus()1702 void UnicodeSet::setToBogus() {
1703     clear(); // Remove everything in the set.
1704     fFlags = kIsBogus;
1705 }
1706 
1707 //----------------------------------------------------------------
1708 // Implementation: Fundamental operators
1709 //----------------------------------------------------------------
1710 
max(UChar32 a,UChar32 b)1711 static inline UChar32 max(UChar32 a, UChar32 b) {
1712     return (a > b) ? a : b;
1713 }
1714 
1715 // polarity = 0, 3 is normal: x xor y
1716 // polarity = 1, 2: x xor ~y == x === y
1717 
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1718 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1719     if (isFrozen() || isBogus()) {
1720         return;
1721     }
1722     if (!ensureBufferCapacity(len + otherLen)) {
1723         return;
1724     }
1725 
1726     int32_t i = 0, j = 0, k = 0;
1727     UChar32 a = list[i++];
1728     UChar32 b;
1729     if (polarity == 1 || polarity == 2) {
1730         b = UNICODESET_LOW;
1731         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1732             ++j;
1733             b = other[j];
1734         }
1735     } else {
1736         b = other[j++];
1737     }
1738     // simplest of all the routines
1739     // sort the values, discarding identicals!
1740     for (;;) {
1741         if (a < b) {
1742             buffer[k++] = a;
1743             a = list[i++];
1744         } else if (b < a) {
1745             buffer[k++] = b;
1746             b = other[j++];
1747         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1748             // discard both values!
1749             a = list[i++];
1750             b = other[j++];
1751         } else { // DONE!
1752             buffer[k++] = UNICODESET_HIGH;
1753             len = k;
1754             break;
1755         }
1756     }
1757     swapBuffers();
1758     releasePattern();
1759 }
1760 
1761 // polarity = 0 is normal: x union y
1762 // polarity = 2: x union ~y
1763 // polarity = 1: ~x union y
1764 // polarity = 3: ~x union ~y
1765 
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1766 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1767     if (isFrozen() || isBogus() || other==NULL) {
1768         return;
1769     }
1770     if (!ensureBufferCapacity(len + otherLen)) {
1771         return;
1772     }
1773 
1774     int32_t i = 0, j = 0, k = 0;
1775     UChar32 a = list[i++];
1776     UChar32 b = other[j++];
1777     // change from xor is that we have to check overlapping pairs
1778     // polarity bit 1 means a is second, bit 2 means b is.
1779     for (;;) {
1780         switch (polarity) {
1781           case 0: // both first; take lower if unequal
1782             if (a < b) { // take a
1783                 // Back up over overlapping ranges in buffer[]
1784                 if (k > 0 && a <= buffer[k-1]) {
1785                     // Pick latter end value in buffer[] vs. list[]
1786                     a = max(list[i], buffer[--k]);
1787                 } else {
1788                     // No overlap
1789                     buffer[k++] = a;
1790                     a = list[i];
1791                 }
1792                 i++; // Common if/else code factored out
1793                 polarity ^= 1;
1794             } else if (b < a) { // take b
1795                 if (k > 0 && b <= buffer[k-1]) {
1796                     b = max(other[j], buffer[--k]);
1797                 } else {
1798                     buffer[k++] = b;
1799                     b = other[j];
1800                 }
1801                 j++;
1802                 polarity ^= 2;
1803             } else { // a == b, take a, drop b
1804                 if (a == UNICODESET_HIGH) goto loop_end;
1805                 // This is symmetrical; it doesn't matter if
1806                 // we backtrack with a or b. - liu
1807                 if (k > 0 && a <= buffer[k-1]) {
1808                     a = max(list[i], buffer[--k]);
1809                 } else {
1810                     // No overlap
1811                     buffer[k++] = a;
1812                     a = list[i];
1813                 }
1814                 i++;
1815                 polarity ^= 1;
1816                 b = other[j++];
1817                 polarity ^= 2;
1818             }
1819             break;
1820           case 3: // both second; take higher if unequal, and drop other
1821             if (b <= a) { // take a
1822                 if (a == UNICODESET_HIGH) goto loop_end;
1823                 buffer[k++] = a;
1824             } else { // take b
1825                 if (b == UNICODESET_HIGH) goto loop_end;
1826                 buffer[k++] = b;
1827             }
1828             a = list[i++];
1829             polarity ^= 1;   // factored common code
1830             b = other[j++];
1831             polarity ^= 2;
1832             break;
1833           case 1: // a second, b first; if b < a, overlap
1834             if (a < b) { // no overlap, take a
1835                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836             } else if (b < a) { // OVERLAP, drop b
1837                 b = other[j++];
1838                 polarity ^= 2;
1839             } else { // a == b, drop both!
1840                 if (a == UNICODESET_HIGH) goto loop_end;
1841                 a = list[i++];
1842                 polarity ^= 1;
1843                 b = other[j++];
1844                 polarity ^= 2;
1845             }
1846             break;
1847           case 2: // a first, b second; if a < b, overlap
1848             if (b < a) { // no overlap, take b
1849                 buffer[k++] = b;
1850                 b = other[j++];
1851                 polarity ^= 2;
1852             } else  if (a < b) { // OVERLAP, drop a
1853                 a = list[i++];
1854                 polarity ^= 1;
1855             } else { // a == b, drop both!
1856                 if (a == UNICODESET_HIGH) goto loop_end;
1857                 a = list[i++];
1858                 polarity ^= 1;
1859                 b = other[j++];
1860                 polarity ^= 2;
1861             }
1862             break;
1863         }
1864     }
1865  loop_end:
1866     buffer[k++] = UNICODESET_HIGH;    // terminate
1867     len = k;
1868     swapBuffers();
1869     releasePattern();
1870 }
1871 
1872 // polarity = 0 is normal: x intersect y
1873 // polarity = 2: x intersect ~y == set-minus
1874 // polarity = 1: ~x intersect y
1875 // polarity = 3: ~x intersect ~y
1876 
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1877 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878     if (isFrozen() || isBogus()) {
1879         return;
1880     }
1881     if (!ensureBufferCapacity(len + otherLen)) {
1882         return;
1883     }
1884 
1885     int32_t i = 0, j = 0, k = 0;
1886     UChar32 a = list[i++];
1887     UChar32 b = other[j++];
1888     // change from xor is that we have to check overlapping pairs
1889     // polarity bit 1 means a is second, bit 2 means b is.
1890     for (;;) {
1891         switch (polarity) {
1892           case 0: // both first; drop the smaller
1893             if (a < b) { // drop a
1894                 a = list[i++];
1895                 polarity ^= 1;
1896             } else if (b < a) { // drop b
1897                 b = other[j++];
1898                 polarity ^= 2;
1899             } else { // a == b, take one, drop other
1900                 if (a == UNICODESET_HIGH) goto loop_end;
1901                 buffer[k++] = a;
1902                 a = list[i++];
1903                 polarity ^= 1;
1904                 b = other[j++];
1905                 polarity ^= 2;
1906             }
1907             break;
1908           case 3: // both second; take lower if unequal
1909             if (a < b) { // take a
1910                 buffer[k++] = a;
1911                 a = list[i++];
1912                 polarity ^= 1;
1913             } else if (b < a) { // take b
1914                 buffer[k++] = b;
1915                 b = other[j++];
1916                 polarity ^= 2;
1917             } else { // a == b, take one, drop other
1918                 if (a == UNICODESET_HIGH) goto loop_end;
1919                 buffer[k++] = a;
1920                 a = list[i++];
1921                 polarity ^= 1;
1922                 b = other[j++];
1923                 polarity ^= 2;
1924             }
1925             break;
1926           case 1: // a second, b first;
1927             if (a < b) { // NO OVERLAP, drop a
1928                 a = list[i++];
1929                 polarity ^= 1;
1930             } else if (b < a) { // OVERLAP, take b
1931                 buffer[k++] = b;
1932                 b = other[j++];
1933                 polarity ^= 2;
1934             } else { // a == b, drop both!
1935                 if (a == UNICODESET_HIGH) goto loop_end;
1936                 a = list[i++];
1937                 polarity ^= 1;
1938                 b = other[j++];
1939                 polarity ^= 2;
1940             }
1941             break;
1942           case 2: // a first, b second; if a < b, overlap
1943             if (b < a) { // no overlap, drop b
1944                 b = other[j++];
1945                 polarity ^= 2;
1946             } else  if (a < b) { // OVERLAP, take a
1947                 buffer[k++] = a;
1948                 a = list[i++];
1949                 polarity ^= 1;
1950             } else { // a == b, drop both!
1951                 if (a == UNICODESET_HIGH) goto loop_end;
1952                 a = list[i++];
1953                 polarity ^= 1;
1954                 b = other[j++];
1955                 polarity ^= 2;
1956             }
1957             break;
1958         }
1959     }
1960  loop_end:
1961     buffer[k++] = UNICODESET_HIGH;    // terminate
1962     len = k;
1963     swapBuffers();
1964     releasePattern();
1965 }
1966 
1967 /**
1968  * Append the <code>toPattern()</code> representation of a
1969  * string to the given <code>StringBuffer</code>.
1970  */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1971 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1972 escapeUnprintable) {
1973     UChar32 cp;
1974     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1975         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1976     }
1977 }
1978 
1979 /**
1980  * Append the <code>toPattern()</code> representation of a
1981  * character to the given <code>StringBuffer</code>.
1982  */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1983 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1984 escapeUnprintable) {
1985     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1986         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1987         // unprintable
1988         if (ICU_Utility::escapeUnprintable(buf, c)) {
1989             return;
1990         }
1991     }
1992     // Okay to let ':' pass through
1993     switch (c) {
1994     case u'[':
1995     case u']':
1996     case u'-':
1997     case u'^':
1998     case u'&':
1999     case u'\\':
2000     case u'{':
2001     case u'}':
2002     case u':':
2003     case SymbolTable::SYMBOL_REF:
2004         buf.append(u'\\');
2005         break;
2006     default:
2007         // Escape whitespace
2008         if (PatternProps::isWhiteSpace(c)) {
2009             buf.append(u'\\');
2010         }
2011         break;
2012     }
2013     buf.append(c);
2014 }
2015 
2016 /**
2017  * Append a string representation of this set to result.  This will be
2018  * a cleaned version of the string passed to applyPattern(), if there
2019  * is one.  Otherwise it will be generated.
2020  */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2021 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2022                                       UBool escapeUnprintable) const
2023 {
2024     if (pat != NULL) {
2025         int32_t i;
2026         int32_t backslashCount = 0;
2027         for (i=0; i<patLen; ) {
2028             UChar32 c;
2029             U16_NEXT(pat, i, patLen, c);
2030             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2031                 // If the unprintable character is preceded by an odd
2032                 // number of backslashes, then it has been escaped.
2033                 // Before unescaping it, we delete the final
2034                 // backslash.
2035                 if ((backslashCount % 2) == 1) {
2036                     result.truncate(result.length() - 1);
2037                 }
2038                 ICU_Utility::escapeUnprintable(result, c);
2039                 backslashCount = 0;
2040             } else {
2041                 result.append(c);
2042                 if (c == u'\\') {
2043                     ++backslashCount;
2044                 } else {
2045                     backslashCount = 0;
2046                 }
2047             }
2048         }
2049         return result;
2050     }
2051 
2052     return _generatePattern(result, escapeUnprintable);
2053 }
2054 
2055 /**
2056  * Returns a string representation of this set.  If the result of
2057  * calling this function is passed to a UnicodeSet constructor, it
2058  * will produce another set that is equal to this one.
2059  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2060 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2061                                      UBool escapeUnprintable) const
2062 {
2063     result.truncate(0);
2064     return _toPattern(result, escapeUnprintable);
2065 }
2066 
2067 /**
2068  * Generate and append a string representation of this set to result.
2069  * This does not use this.pat, the cleaned up copy of the string
2070  * passed to applyPattern().
2071  */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2072 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2073                                             UBool escapeUnprintable) const
2074 {
2075     result.append(u'[');
2076 
2077 //  // Check against the predefined categories.  We implicitly build
2078 //  // up ALL category sets the first time toPattern() is called.
2079 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2080 //      if (*this == getCategorySet(cat)) {
2081 //          result.append(u':');
2082 //          result.append(CATEGORY_NAMES, cat*2, 2);
2083 //          return result.append(CATEGORY_CLOSE);
2084 //      }
2085 //  }
2086 
2087     int32_t count = getRangeCount();
2088 
2089     // If the set contains at least 2 intervals and includes both
2090     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2091     // be more economical.
2092     if (count > 1 &&
2093         getRangeStart(0) == MIN_VALUE &&
2094         getRangeEnd(count-1) == MAX_VALUE) {
2095 
2096         // Emit the inverse
2097         result.append(u'^');
2098 
2099         for (int32_t i = 1; i < count; ++i) {
2100             UChar32 start = getRangeEnd(i-1)+1;
2101             UChar32 end = getRangeStart(i)-1;
2102             _appendToPat(result, start, escapeUnprintable);
2103             if (start != end) {
2104                 if ((start+1) != end) {
2105                     result.append(u'-');
2106                 }
2107                 _appendToPat(result, end, escapeUnprintable);
2108             }
2109         }
2110     }
2111 
2112     // Default; emit the ranges as pairs
2113     else {
2114         for (int32_t i = 0; i < count; ++i) {
2115             UChar32 start = getRangeStart(i);
2116             UChar32 end = getRangeEnd(i);
2117             _appendToPat(result, start, escapeUnprintable);
2118             if (start != end) {
2119                 if ((start+1) != end) {
2120                     result.append(u'-');
2121                 }
2122                 _appendToPat(result, end, escapeUnprintable);
2123             }
2124         }
2125     }
2126 
2127     if (strings != nullptr) {
2128         for (int32_t i = 0; i<strings->size(); ++i) {
2129             result.append(u'{');
2130             _appendToPat(result,
2131                          *(const UnicodeString*) strings->elementAt(i),
2132                          escapeUnprintable);
2133             result.append(u'}');
2134         }
2135     }
2136     return result.append(u']');
2137 }
2138 
2139 /**
2140 * Release existing cached pattern
2141 */
releasePattern()2142 void UnicodeSet::releasePattern() {
2143     if (pat) {
2144         uprv_free(pat);
2145         pat = NULL;
2146         patLen = 0;
2147     }
2148 }
2149 
2150 /**
2151 * Set the new pattern to cache.
2152 */
setPattern(const char16_t * newPat,int32_t newPatLen)2153 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2154     releasePattern();
2155     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2156     if (pat) {
2157         patLen = newPatLen;
2158         u_memcpy(pat, newPat, patLen);
2159         pat[patLen] = 0;
2160     }
2161     // else we don't care if malloc failed. This was just a nice cache.
2162     // We can regenerate an equivalent pattern later when requested.
2163 }
2164 
freeze()2165 UnicodeSet *UnicodeSet::freeze() {
2166     if(!isFrozen() && !isBogus()) {
2167         compact();
2168 
2169         // Optimize contains() and span() and similar functions.
2170         if (hasStrings()) {
2171             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2172             if (stringSpan == nullptr) {
2173                 setToBogus();
2174                 return this;
2175             } else if (!stringSpan->needsStringSpanUTF16()) {
2176                 // All strings are irrelevant for span() etc. because
2177                 // all of each string's code points are contained in this set.
2178                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2179                 // many relevant strings as UTF-16.
2180                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2181                 delete stringSpan;
2182                 stringSpan = NULL;
2183             }
2184         }
2185         if (stringSpan == NULL) {
2186             // No span-relevant strings: Optimize for code point spans.
2187             bmpSet=new BMPSet(list, len);
2188             if (bmpSet == NULL) { // Check for memory allocation error.
2189                 setToBogus();
2190             }
2191         }
2192     }
2193     return this;
2194 }
2195 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2196 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2197     if(length>0 && bmpSet!=NULL) {
2198         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2199     }
2200     if(length<0) {
2201         length=u_strlen(s);
2202     }
2203     if(length==0) {
2204         return 0;
2205     }
2206     if(stringSpan!=NULL) {
2207         return stringSpan->span(s, length, spanCondition);
2208     } else if(hasStrings()) {
2209         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2210                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2211                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2212         UnicodeSetStringSpan strSpan(*this, *strings, which);
2213         if(strSpan.needsStringSpanUTF16()) {
2214             return strSpan.span(s, length, spanCondition);
2215         }
2216     }
2217 
2218     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2219         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2220     }
2221 
2222     UChar32 c;
2223     int32_t start=0, prev=0;
2224     do {
2225         U16_NEXT(s, start, length, c);
2226         if(spanCondition!=contains(c)) {
2227             break;
2228         }
2229     } while((prev=start)<length);
2230     return prev;
2231 }
2232 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2233 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2234     if(length>0 && bmpSet!=NULL) {
2235         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2236     }
2237     if(length<0) {
2238         length=u_strlen(s);
2239     }
2240     if(length==0) {
2241         return 0;
2242     }
2243     if(stringSpan!=NULL) {
2244         return stringSpan->spanBack(s, length, spanCondition);
2245     } else if(hasStrings()) {
2246         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2247                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2248                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2249         UnicodeSetStringSpan strSpan(*this, *strings, which);
2250         if(strSpan.needsStringSpanUTF16()) {
2251             return strSpan.spanBack(s, length, spanCondition);
2252         }
2253     }
2254 
2255     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2256         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2257     }
2258 
2259     UChar32 c;
2260     int32_t prev=length;
2261     do {
2262         U16_PREV(s, 0, length, c);
2263         if(spanCondition!=contains(c)) {
2264             break;
2265         }
2266     } while((prev=length)>0);
2267     return prev;
2268 }
2269 
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2270 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2271     if(length>0 && bmpSet!=NULL) {
2272         const uint8_t *s0=(const uint8_t *)s;
2273         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2274     }
2275     if(length<0) {
2276         length=(int32_t)uprv_strlen(s);
2277     }
2278     if(length==0) {
2279         return 0;
2280     }
2281     if(stringSpan!=NULL) {
2282         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2283     } else if(hasStrings()) {
2284         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2285                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2286                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2287         UnicodeSetStringSpan strSpan(*this, *strings, which);
2288         if(strSpan.needsStringSpanUTF8()) {
2289             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2290         }
2291     }
2292 
2293     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2294         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2295     }
2296 
2297     UChar32 c;
2298     int32_t start=0, prev=0;
2299     do {
2300         U8_NEXT_OR_FFFD(s, start, length, c);
2301         if(spanCondition!=contains(c)) {
2302             break;
2303         }
2304     } while((prev=start)<length);
2305     return prev;
2306 }
2307 
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2308 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2309     if(length>0 && bmpSet!=NULL) {
2310         const uint8_t *s0=(const uint8_t *)s;
2311         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2312     }
2313     if(length<0) {
2314         length=(int32_t)uprv_strlen(s);
2315     }
2316     if(length==0) {
2317         return 0;
2318     }
2319     if(stringSpan!=NULL) {
2320         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2321     } else if(hasStrings()) {
2322         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2323                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2324                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2325         UnicodeSetStringSpan strSpan(*this, *strings, which);
2326         if(strSpan.needsStringSpanUTF8()) {
2327             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2328         }
2329     }
2330 
2331     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2332         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2333     }
2334 
2335     UChar32 c;
2336     int32_t prev=length;
2337     do {
2338         U8_PREV_OR_FFFD(s, 0, length, c);
2339         if(spanCondition!=contains(c)) {
2340             break;
2341         }
2342     } while((prev=length)>0);
2343     return prev;
2344 }
2345 
2346 U_NAMESPACE_END
2347