1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "ruleiter.h"
21 #include "cmemory.h"
22 #include "cstring.h"
23 #include "patternprops.h"
24 #include "uelement.h"
25 #include "util.h"
26 #include "uvector.h"
27 #include "charstr.h"
28 #include "ustrfmt.h"
29 #include "uassert.h"
30 #include "bmpset.h"
31 #include "unisetspan.h"
32 
33 // Define UChar constants using hex for EBCDIC compatibility
34 // Used #define to reduce private static exports and memory access time.
35 #define SET_OPEN        ((UChar)0x005B) /*[*/
36 #define SET_CLOSE       ((UChar)0x005D) /*]*/
37 #define HYPHEN          ((UChar)0x002D) /*-*/
38 #define COMPLEMENT      ((UChar)0x005E) /*^*/
39 #define COLON           ((UChar)0x003A) /*:*/
40 #define BACKSLASH       ((UChar)0x005C) /*\*/
41 #define INTERSECTION    ((UChar)0x0026) /*&*/
42 #define UPPER_U         ((UChar)0x0055) /*U*/
43 #define LOWER_U         ((UChar)0x0075) /*u*/
44 #define OPEN_BRACE      ((UChar)123)    /*{*/
45 #define CLOSE_BRACE     ((UChar)125)    /*}*/
46 #define UPPER_P         ((UChar)0x0050) /*P*/
47 #define LOWER_P         ((UChar)0x0070) /*p*/
48 #define UPPER_N         ((UChar)78)     /*N*/
49 #define EQUALS          ((UChar)0x003D) /*=*/
50 
51 // HIGH_VALUE > all valid values. 110000 for codepoints
52 #define UNICODESET_HIGH 0x0110000
53 
54 // LOW <= all valid values. ZERO for codepoints
55 #define UNICODESET_LOW 0x000000
56 
57 /** Max list [0, 1, 2, ..., max code point, HIGH] */
58 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
59 
60 U_NAMESPACE_BEGIN
61 
~SymbolTable()62 SymbolTable::~SymbolTable() {}
63 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65 
66 /**
67  * Modify the given UChar32 variable so that it is in range, by
68  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70  * It modifies its argument in-place and also returns it.
71  */
72 static inline UChar32 pinCodePoint(UChar32& c) {
73     if (c < UNICODESET_LOW) {
74         c = UNICODESET_LOW;
75     } else if (c > (UNICODESET_HIGH-1)) {
76         c = (UNICODESET_HIGH-1);
77     }
78     return c;
79 }
80 
81 //----------------------------------------------------------------
82 // Debugging
83 //----------------------------------------------------------------
84 
85 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below.  This will result in text being sent to stdout that looks
88 // like this:
89 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
94 
95 // #define DEBUG_MEM
96 
97 #ifdef DEBUG_MEM
98 #include <stdio.h>
99 static int32_t _dbgCount = 0;
100 
_dbgct(UnicodeSet * set)101 static inline void _dbgct(UnicodeSet* set) {
102     UnicodeString str;
103     set->toPattern(str, TRUE);
104     char buf[40];
105     str.extract(0, 39, buf, "");
106     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
107 }
108 
_dbgdt(UnicodeSet * set)109 static inline void _dbgdt(UnicodeSet* set) {
110     UnicodeString str;
111     set->toPattern(str, TRUE);
112     char buf[40];
113     str.extract(0, 39, buf, "");
114     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
115 }
116 
117 #else
118 
119 #define _dbgct(set)
120 #define _dbgdt(set)
121 
122 #endif
123 
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
127 
cloneUnicodeString(UElement * dst,UElement * src)128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
129     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
130 }
131 
compareUnicodeString(UElement t1,UElement t2)132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
134     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
135     return a.compare(b);
136 }
137 
hasStrings() const138 UBool UnicodeSet::hasStrings() const {
139     return strings != nullptr && !strings->isEmpty();
140 }
141 
stringsSize() const142 int32_t UnicodeSet::stringsSize() const {
143     return strings == nullptr ? 0 : strings->size();
144 }
145 
stringsContains(const UnicodeString & s) const146 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
147     return strings != nullptr && strings->contains((void*) &s);
148 }
149 
150 //----------------------------------------------------------------
151 // Constructors &c
152 //----------------------------------------------------------------
153 
154 /**
155  * Constructs an empty set.
156  */
UnicodeSet()157 UnicodeSet::UnicodeSet() {
158     list[0] = UNICODESET_HIGH;
159     _dbgct(this);
160 }
161 
162 /**
163  * Constructs a set containing the given range. If <code>end >
164  * start</code> then an empty set is created.
165  *
166  * @param start first character, inclusive, of range
167  * @param end last character, inclusive, of range
168  */
UnicodeSet(UChar32 start,UChar32 end)169 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
170     list[0] = UNICODESET_HIGH;
171     add(start, end);
172     _dbgct(this);
173 }
174 
175 /**
176  * Constructs a set that is identical to the given UnicodeSet.
177  */
UnicodeSet(const UnicodeSet & o)178 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
179     *this = o;
180     _dbgct(this);
181 }
182 
183 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)184 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
185     if (ensureCapacity(o.len)) {
186         // *this = o except for bmpSet and stringSpan
187         len = o.len;
188         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
189         if (o.hasStrings()) {
190             UErrorCode status = U_ZERO_ERROR;
191             if (!allocateStrings(status) ||
192                     (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
193                 setToBogus();
194                 return;
195             }
196         }
197         if (o.pat) {
198             setPattern(o.pat, o.patLen);
199         }
200         _dbgct(this);
201     }
202 }
203 
204 /**
205  * Destructs the set.
206  */
~UnicodeSet()207 UnicodeSet::~UnicodeSet() {
208     _dbgdt(this); // first!
209     if (list != stackList) {
210         uprv_free(list);
211     }
212     delete bmpSet;
213     if (buffer != stackList) {
214         uprv_free(buffer);
215     }
216     delete strings;
217     delete stringSpan;
218     releasePattern();
219 }
220 
221 /**
222  * Assigns this object to be a copy of another.
223  */
operator =(const UnicodeSet & o)224 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
225     return copyFrom(o, FALSE);
226 }
227 
copyFrom(const UnicodeSet & o,UBool asThawed)228 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
229     if (this == &o) {
230         return *this;
231     }
232     if (isFrozen()) {
233         return *this;
234     }
235     if (o.isBogus()) {
236         setToBogus();
237         return *this;
238     }
239     if (!ensureCapacity(o.len)) {
240         // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
241         return *this;
242     }
243     len = o.len;
244     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
245     if (o.bmpSet != nullptr && !asThawed) {
246         bmpSet = new BMPSet(*o.bmpSet, list, len);
247         if (bmpSet == NULL) { // Check for memory allocation error.
248             setToBogus();
249             return *this;
250         }
251     }
252     if (o.hasStrings()) {
253         UErrorCode status = U_ZERO_ERROR;
254         if ((strings == nullptr && !allocateStrings(status)) ||
255                 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
256             setToBogus();
257             return *this;
258         }
259     } else if (hasStrings()) {
260         strings->removeAllElements();
261     }
262     if (o.stringSpan != nullptr && !asThawed) {
263         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
264         if (stringSpan == NULL) { // Check for memory allocation error.
265             setToBogus();
266             return *this;
267         }
268     }
269     releasePattern();
270     if (o.pat) {
271         setPattern(o.pat, o.patLen);
272     }
273     return *this;
274 }
275 
276 /**
277  * Returns a copy of this object.  All UnicodeMatcher objects have
278  * to support cloning in order to allow classes using
279  * UnicodeMatchers, such as Transliterator, to implement cloning.
280  */
clone() const281 UnicodeSet* UnicodeSet::clone() const {
282     return new UnicodeSet(*this);
283 }
284 
cloneAsThawed() const285 UnicodeSet *UnicodeSet::cloneAsThawed() const {
286     return new UnicodeSet(*this, TRUE);
287 }
288 
289 /**
290  * Compares the specified object with this set for equality.  Returns
291  * <tt>true</tt> if the two sets
292  * have the same size, and every member of the specified set is
293  * contained in this set (or equivalently, every member of this set is
294  * contained in the specified set).
295  *
296  * @param o set to be compared for equality with this set.
297  * @return <tt>true</tt> if the specified set is equal to this set.
298  */
operator ==(const UnicodeSet & o) const299 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
300     if (len != o.len) return FALSE;
301     for (int32_t i = 0; i < len; ++i) {
302         if (list[i] != o.list[i]) return FALSE;
303     }
304     if (hasStrings() != o.hasStrings()) { return FALSE; }
305     if (hasStrings() && *strings != *o.strings) return FALSE;
306     return TRUE;
307 }
308 
309 /**
310  * Returns the hash code value for this set.
311  *
312  * @return the hash code value for this set.
313  * @see Object#hashCode()
314  */
hashCode(void) const315 int32_t UnicodeSet::hashCode(void) const {
316     uint32_t result = static_cast<uint32_t>(len);
317     for (int32_t i = 0; i < len; ++i) {
318         result *= 1000003u;
319         result += list[i];
320     }
321     return static_cast<int32_t>(result);
322 }
323 
324 //----------------------------------------------------------------
325 // Public API
326 //----------------------------------------------------------------
327 
328 /**
329  * Returns the number of elements in this set (its cardinality),
330  * Note than the elements of a set may include both individual
331  * codepoints and strings.
332  *
333  * @return the number of elements in this set (its cardinality).
334  */
size(void) const335 int32_t UnicodeSet::size(void) const {
336     int32_t n = 0;
337     int32_t count = getRangeCount();
338     for (int32_t i = 0; i < count; ++i) {
339         n += getRangeEnd(i) - getRangeStart(i) + 1;
340     }
341     return n + stringsSize();
342 }
343 
344 /**
345  * Returns <tt>true</tt> if this set contains no elements.
346  *
347  * @return <tt>true</tt> if this set contains no elements.
348  */
isEmpty(void) const349 UBool UnicodeSet::isEmpty(void) const {
350     return len == 1 && !hasStrings();
351 }
352 
353 /**
354  * Returns true if this set contains the given character.
355  * @param c character to be checked for containment
356  * @return true if the test condition is met
357  */
contains(UChar32 c) const358 UBool UnicodeSet::contains(UChar32 c) const {
359     // Set i to the index of the start item greater than ch
360     // We know we will terminate without length test!
361     // LATER: for large sets, add binary search
362     //int32_t i = -1;
363     //for (;;) {
364     //    if (c < list[++i]) break;
365     //}
366     if (bmpSet != NULL) {
367         return bmpSet->contains(c);
368     }
369     if (stringSpan != NULL) {
370         return stringSpan->contains(c);
371     }
372     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
373         return FALSE;
374     }
375     int32_t i = findCodePoint(c);
376     return (UBool)(i & 1); // return true if odd
377 }
378 
379 /**
380  * Returns the smallest value i such that c < list[i].  Caller
381  * must ensure that c is a legal value or this method will enter
382  * an infinite loop.  This method performs a binary search.
383  * @param c a character in the range MIN_VALUE..MAX_VALUE
384  * inclusive
385  * @return the smallest integer i in the range 0..len-1,
386  * inclusive, such that c < list[i]
387  */
findCodePoint(UChar32 c) const388 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
389     /* Examples:
390                                        findCodePoint(c)
391        set              list[]         c=0 1 3 4 7 8
392        ===              ==============   ===========
393        []               [110000]         0 0 0 0 0 0
394        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
395        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
396        [:Any:]          [0, 110000]      1 1 1 1 1 1
397      */
398 
399     // Return the smallest i such that c < list[i].  Assume
400     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
401     if (c < list[0])
402         return 0;
403     // High runner test.  c is often after the last range, so an
404     // initial check for this condition pays off.
405     int32_t lo = 0;
406     int32_t hi = len - 1;
407     if (lo >= hi || c >= list[hi-1])
408         return hi;
409     // invariant: c >= list[lo]
410     // invariant: c < list[hi]
411     for (;;) {
412         int32_t i = (lo + hi) >> 1;
413         if (i == lo) {
414             break; // Found!
415         } else if (c < list[i]) {
416             hi = i;
417         } else {
418             lo = i;
419         }
420     }
421     return hi;
422 }
423 
424 /**
425  * Returns true if this set contains every character
426  * of the given range.
427  * @param start first character, inclusive, of the range
428  * @param end last character, inclusive, of the range
429  * @return true if the test condition is met
430  */
contains(UChar32 start,UChar32 end) const431 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
432     //int32_t i = -1;
433     //for (;;) {
434     //    if (start < list[++i]) break;
435     //}
436     int32_t i = findCodePoint(start);
437     return ((i & 1) != 0 && end < list[i]);
438 }
439 
440 /**
441  * Returns <tt>true</tt> if this set contains the given
442  * multicharacter string.
443  * @param s string to be checked for containment
444  * @return <tt>true</tt> if this set contains the specified string
445  */
contains(const UnicodeString & s) const446 UBool UnicodeSet::contains(const UnicodeString& s) const {
447     if (s.length() == 0) return FALSE;
448     int32_t cp = getSingleCP(s);
449     if (cp < 0) {
450         return stringsContains(s);
451     } else {
452         return contains((UChar32) cp);
453     }
454 }
455 
456 /**
457  * Returns true if this set contains all the characters and strings
458  * of the given set.
459  * @param c set to be checked for containment
460  * @return true if the test condition is met
461  */
containsAll(const UnicodeSet & c) const462 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
463     // The specified set is a subset if all of its pairs are contained in
464     // this set.  It's possible to code this more efficiently in terms of
465     // direct manipulation of the inversion lists if the need arises.
466     int32_t n = c.getRangeCount();
467     for (int i=0; i<n; ++i) {
468         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
469             return FALSE;
470         }
471     }
472     return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
473 }
474 
475 /**
476  * Returns true if this set contains all the characters
477  * of the given string.
478  * @param s string containing characters to be checked for containment
479  * @return true if the test condition is met
480  */
containsAll(const UnicodeString & s) const481 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
482     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
483                    s.length());
484 }
485 
486 /**
487  * Returns true if this set contains none of the characters
488  * of the given range.
489  * @param start first character, inclusive, of the range
490  * @param end last character, inclusive, of the range
491  * @return true if the test condition is met
492  */
containsNone(UChar32 start,UChar32 end) const493 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
494     //int32_t i = -1;
495     //for (;;) {
496     //    if (start < list[++i]) break;
497     //}
498     int32_t i = findCodePoint(start);
499     return ((i & 1) == 0 && end < list[i]);
500 }
501 
502 /**
503  * Returns true if this set contains none of the characters and strings
504  * of the given set.
505  * @param c set to be checked for containment
506  * @return true if the test condition is met
507  */
containsNone(const UnicodeSet & c) const508 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
509     // The specified set is a subset if all of its pairs are contained in
510     // this set.  It's possible to code this more efficiently in terms of
511     // direct manipulation of the inversion lists if the need arises.
512     int32_t n = c.getRangeCount();
513     for (int32_t i=0; i<n; ++i) {
514         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
515             return FALSE;
516         }
517     }
518     return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
519 }
520 
521 /**
522  * Returns true if this set contains none of the characters
523  * of the given string.
524  * @param s string containing characters to be checked for containment
525  * @return true if the test condition is met
526  */
containsNone(const UnicodeString & s) const527 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
528     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
529                    s.length());
530 }
531 
532 /**
533  * Returns <tt>true</tt> if this set contains any character whose low byte
534  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
535  * indexing.
536  */
matchesIndexValue(uint8_t v) const537 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
538     /* The index value v, in the range [0,255], is contained in this set if
539      * it is contained in any pair of this set.  Pairs either have the high
540      * bytes equal, or unequal.  If the high bytes are equal, then we have
541      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
542      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
543      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
544      * time zone month containment logic.)
545      */
546     int32_t i;
547     int32_t rangeCount=getRangeCount();
548     for (i=0; i<rangeCount; ++i) {
549         UChar32 low = getRangeStart(i);
550         UChar32 high = getRangeEnd(i);
551         if ((low & ~0xFF) == (high & ~0xFF)) {
552             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
553                 return TRUE;
554             }
555         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
556             return TRUE;
557         }
558     }
559     if (hasStrings()) {
560         for (i=0; i<strings->size(); ++i) {
561             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
562             //if (s.length() == 0) {
563             //    // Empty strings match everything
564             //    return TRUE;
565             //}
566             // assert(s.length() != 0); // We enforce this elsewhere
567             UChar32 c = s.char32At(0);
568             if ((c & 0xFF) == v) {
569                 return TRUE;
570             }
571         }
572     }
573     return FALSE;
574 }
575 
576 /**
577  * Implementation of UnicodeMatcher::matches().  Always matches the
578  * longest possible multichar string.
579  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)580 UMatchDegree UnicodeSet::matches(const Replaceable& text,
581                                  int32_t& offset,
582                                  int32_t limit,
583                                  UBool incremental) {
584     if (offset == limit) {
585         // Strings, if any, have length != 0, so we don't worry
586         // about them here.  If we ever allow zero-length strings
587         // we much check for them here.
588         if (contains(U_ETHER)) {
589             return incremental ? U_PARTIAL_MATCH : U_MATCH;
590         } else {
591             return U_MISMATCH;
592         }
593     } else {
594         if (hasStrings()) { // try strings first
595 
596             // might separate forward and backward loops later
597             // for now they are combined
598 
599             // TODO Improve efficiency of this, at least in the forward
600             // direction, if not in both.  In the forward direction we
601             // can assume the strings are sorted.
602 
603             int32_t i;
604             UBool forward = offset < limit;
605 
606             // firstChar is the leftmost char to match in the
607             // forward direction or the rightmost char to match in
608             // the reverse direction.
609             UChar firstChar = text.charAt(offset);
610 
611             // If there are multiple strings that can match we
612             // return the longest match.
613             int32_t highWaterLength = 0;
614 
615             for (i=0; i<strings->size(); ++i) {
616                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
617 
618                 //if (trial.length() == 0) {
619                 //    return U_MATCH; // null-string always matches
620                 //}
621                 // assert(trial.length() != 0); // We ensure this elsewhere
622 
623                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
624 
625                 // Strings are sorted, so we can optimize in the
626                 // forward direction.
627                 if (forward && c > firstChar) break;
628                 if (c != firstChar) continue;
629 
630                 int32_t matchLen = matchRest(text, offset, limit, trial);
631 
632                 if (incremental) {
633                     int32_t maxLen = forward ? limit-offset : offset-limit;
634                     if (matchLen == maxLen) {
635                         // We have successfully matched but only up to limit.
636                         return U_PARTIAL_MATCH;
637                     }
638                 }
639 
640                 if (matchLen == trial.length()) {
641                     // We have successfully matched the whole string.
642                     if (matchLen > highWaterLength) {
643                         highWaterLength = matchLen;
644                     }
645                     // In the forward direction we know strings
646                     // are sorted so we can bail early.
647                     if (forward && matchLen < highWaterLength) {
648                         break;
649                     }
650                     continue;
651                 }
652             }
653 
654             // We've checked all strings without a partial match.
655             // If we have full matches, return the longest one.
656             if (highWaterLength != 0) {
657                 offset += forward ? highWaterLength : -highWaterLength;
658                 return U_MATCH;
659             }
660         }
661         return UnicodeFilter::matches(text, offset, limit, incremental);
662     }
663 }
664 
665 /**
666  * Returns the longest match for s in text at the given position.
667  * If limit > start then match forward from start+1 to limit
668  * matching all characters except s.charAt(0).  If limit < start,
669  * go backward starting from start-1 matching all characters
670  * except s.charAt(s.length()-1).  This method assumes that the
671  * first character, text.charAt(start), matches s, so it does not
672  * check it.
673  * @param text the text to match
674  * @param start the first character to match.  In the forward
675  * direction, text.charAt(start) is matched against s.charAt(0).
676  * In the reverse direction, it is matched against
677  * s.charAt(s.length()-1).
678  * @param limit the limit offset for matching, either last+1 in
679  * the forward direction, or last-1 in the reverse direction,
680  * where last is the index of the last character to match.
681  * @return If part of s matches up to the limit, return |limit -
682  * start|.  If all of s matches before reaching the limit, return
683  * s.length().  If there is a mismatch between s and text, return
684  * 0
685  */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)686 int32_t UnicodeSet::matchRest(const Replaceable& text,
687                               int32_t start, int32_t limit,
688                               const UnicodeString& s) {
689     int32_t i;
690     int32_t maxLen;
691     int32_t slen = s.length();
692     if (start < limit) {
693         maxLen = limit - start;
694         if (maxLen > slen) maxLen = slen;
695         for (i = 1; i < maxLen; ++i) {
696             if (text.charAt(start + i) != s.charAt(i)) return 0;
697         }
698     } else {
699         maxLen = start - limit;
700         if (maxLen > slen) maxLen = slen;
701         --slen; // <=> slen = s.length() - 1;
702         for (i = 1; i < maxLen; ++i) {
703             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
704         }
705     }
706     return maxLen;
707 }
708 
709 /**
710  * Implement of UnicodeMatcher
711  */
addMatchSetTo(UnicodeSet & toUnionTo) const712 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
713     toUnionTo.addAll(*this);
714 }
715 
716 /**
717  * Returns the index of the given character within this set, where
718  * the set is ordered by ascending code point.  If the character
719  * is not in this set, return -1.  The inverse of this method is
720  * <code>charAt()</code>.
721  * @return an index from 0..size()-1, or -1
722  */
indexOf(UChar32 c) const723 int32_t UnicodeSet::indexOf(UChar32 c) const {
724     if (c < MIN_VALUE || c > MAX_VALUE) {
725         return -1;
726     }
727     int32_t i = 0;
728     int32_t n = 0;
729     for (;;) {
730         UChar32 start = list[i++];
731         if (c < start) {
732             return -1;
733         }
734         UChar32 limit = list[i++];
735         if (c < limit) {
736             return n + c - start;
737         }
738         n += limit - start;
739     }
740 }
741 
742 /**
743  * Returns the character at the given index within this set, where
744  * the set is ordered by ascending code point.  If the index is
745  * out of range, return (UChar32)-1.  The inverse of this method is
746  * <code>indexOf()</code>.
747  * @param index an index from 0..size()-1
748  * @return the character at the given index, or (UChar32)-1.
749  */
charAt(int32_t index) const750 UChar32 UnicodeSet::charAt(int32_t index) const {
751     if (index >= 0) {
752         // len2 is the largest even integer <= len, that is, it is len
753         // for even values and len-1 for odd values.  With odd values
754         // the last entry is UNICODESET_HIGH.
755         int32_t len2 = len & ~1;
756         for (int32_t i=0; i < len2;) {
757             UChar32 start = list[i++];
758             int32_t count = list[i++] - start;
759             if (index < count) {
760                 return (UChar32)(start + index);
761             }
762             index -= count;
763         }
764     }
765     return (UChar32)-1;
766 }
767 
768 /**
769  * Make this object represent the range <code>start - end</code>.
770  * If <code>end > start</code> then this object is set to an
771  * an empty range.
772  *
773  * @param start first character in the set, inclusive
774  * @rparam end last character in the set, inclusive
775  */
set(UChar32 start,UChar32 end)776 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
777     clear();
778     complement(start, end);
779     return *this;
780 }
781 
782 /**
783  * Adds the specified range to this set if it is not already
784  * present.  If this set already contains the specified range,
785  * the call leaves this set unchanged.  If <code>end > start</code>
786  * then an empty range is added, leaving the set unchanged.
787  *
788  * @param start first character, inclusive, of range to be added
789  * to this set.
790  * @param end last character, inclusive, of range to be added
791  * to this set.
792  */
add(UChar32 start,UChar32 end)793 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
794     if (pinCodePoint(start) < pinCodePoint(end)) {
795         UChar32 limit = end + 1;
796         // Fast path for adding a new range after the last one.
797         // Odd list length: [..., lastStart, lastLimit, HIGH]
798         if ((len & 1) != 0) {
799             // If the list is empty, set lastLimit low enough to not be adjacent to 0.
800             UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
801             if (lastLimit <= start && !isFrozen() && !isBogus()) {
802                 if (lastLimit == start) {
803                     // Extend the last range.
804                     list[len - 2] = limit;
805                     if (limit == UNICODESET_HIGH) {
806                         --len;
807                     }
808                 } else {
809                     list[len - 1] = start;
810                     if (limit < UNICODESET_HIGH) {
811                         if (ensureCapacity(len + 2)) {
812                             list[len++] = limit;
813                             list[len++] = UNICODESET_HIGH;
814                         }
815                     } else {  // limit == UNICODESET_HIGH
816                         if (ensureCapacity(len + 1)) {
817                             list[len++] = UNICODESET_HIGH;
818                         }
819                     }
820                 }
821                 releasePattern();
822                 return *this;
823             }
824         }
825         // This is slow. Could be much faster using findCodePoint(start)
826         // and modifying the list, dealing with adjacent & overlapping ranges.
827         UChar32 range[3] = { start, limit, UNICODESET_HIGH };
828         add(range, 2, 0);
829     } else if (start == end) {
830         add(start);
831     }
832     return *this;
833 }
834 
835 // #define DEBUG_US_ADD
836 
837 #ifdef DEBUG_US_ADD
838 #include <stdio.h>
dump(UChar32 c)839 void dump(UChar32 c) {
840     if (c <= 0xFF) {
841         printf("%c", (char)c);
842     } else {
843         printf("U+%04X", c);
844     }
845 }
dump(const UChar32 * list,int32_t len)846 void dump(const UChar32* list, int32_t len) {
847     printf("[");
848     for (int32_t i=0; i<len; ++i) {
849         if (i != 0) printf(", ");
850         dump(list[i]);
851     }
852     printf("]");
853 }
854 #endif
855 
856 /**
857  * Adds the specified character to this set if it is not already
858  * present.  If this set already contains the specified character,
859  * the call leaves this set unchanged.
860  */
add(UChar32 c)861 UnicodeSet& UnicodeSet::add(UChar32 c) {
862     // find smallest i such that c < list[i]
863     // if odd, then it is IN the set
864     // if even, then it is OUT of the set
865     int32_t i = findCodePoint(pinCodePoint(c));
866 
867     // already in set?
868     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
869 
870     // HIGH is 0x110000
871     // assert(list[len-1] == HIGH);
872 
873     // empty = [HIGH]
874     // [start_0, limit_0, start_1, limit_1, HIGH]
875 
876     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
877     //                             ^
878     //                             list[i]
879 
880     // i == 0 means c is before the first range
881 
882 #ifdef DEBUG_US_ADD
883     printf("Add of ");
884     dump(c);
885     printf(" found at %d", i);
886     printf(": ");
887     dump(list, len);
888     printf(" => ");
889 #endif
890 
891     if (c == list[i]-1) {
892         // c is before start of next range
893         list[i] = c;
894         // if we touched the HIGH mark, then add a new one
895         if (c == (UNICODESET_HIGH - 1)) {
896             if (!ensureCapacity(len+1)) {
897                 // ensureCapacity will mark the object as Bogus if OOM failure happens.
898                 return *this;
899             }
900             list[len++] = UNICODESET_HIGH;
901         }
902         if (i > 0 && c == list[i-1]) {
903             // collapse adjacent ranges
904 
905             // [..., start_k-1, c, c, limit_k, ..., HIGH]
906             //                     ^
907             //                     list[i]
908 
909             //for (int32_t k=i-1; k<len-2; ++k) {
910             //    list[k] = list[k+2];
911             //}
912             UChar32* dst = list + i - 1;
913             UChar32* src = dst + 2;
914             UChar32* srclimit = list + len;
915             while (src < srclimit) *(dst++) = *(src++);
916 
917             len -= 2;
918         }
919     }
920 
921     else if (i > 0 && c == list[i-1]) {
922         // c is after end of prior range
923         list[i-1]++;
924         // no need to check for collapse here
925     }
926 
927     else {
928         // At this point we know the new char is not adjacent to
929         // any existing ranges, and it is not 10FFFF.
930 
931 
932         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
933         //                             ^
934         //                             list[i]
935 
936         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
937         //                             ^
938         //                             list[i]
939 
940         if (!ensureCapacity(len+2)) {
941             // ensureCapacity will mark the object as Bogus if OOM failure happens.
942             return *this;
943         }
944 
945         UChar32 *p = list + i;
946         uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
947         list[i] = c;
948         list[i+1] = c+1;
949         len += 2;
950     }
951 
952 #ifdef DEBUG_US_ADD
953     dump(list, len);
954     printf("\n");
955 
956     for (i=1; i<len; ++i) {
957         if (list[i] <= list[i-1]) {
958             // Corrupt array!
959             printf("ERROR: list has been corrupted\n");
960             exit(1);
961         }
962     }
963 #endif
964 
965     releasePattern();
966     return *this;
967 }
968 
969 /**
970  * Adds the specified multicharacter to this set if it is not already
971  * present.  If this set already contains the multicharacter,
972  * the call leaves this set unchanged.
973  * Thus "ch" => {"ch"}
974  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
975  * @param s the source string
976  * @return the modified set, for chaining
977  */
add(const UnicodeString & s)978 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
979     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
980     int32_t cp = getSingleCP(s);
981     if (cp < 0) {
982         if (!stringsContains(s)) {
983             _add(s);
984             releasePattern();
985         }
986     } else {
987         add((UChar32)cp);
988     }
989     return *this;
990 }
991 
992 /**
993  * Adds the given string, in order, to 'strings'.  The given string
994  * must have been checked by the caller to not be empty and to not
995  * already be in 'strings'.
996  */
_add(const UnicodeString & s)997 void UnicodeSet::_add(const UnicodeString& s) {
998     if (isFrozen() || isBogus()) {
999         return;
1000     }
1001     UErrorCode ec = U_ZERO_ERROR;
1002     if (strings == nullptr && !allocateStrings(ec)) {
1003         setToBogus();
1004         return;
1005     }
1006     UnicodeString* t = new UnicodeString(s);
1007     if (t == NULL) { // Check for memory allocation error.
1008         setToBogus();
1009         return;
1010     }
1011     strings->sortedInsert(t, compareUnicodeString, ec);
1012     if (U_FAILURE(ec)) {
1013         setToBogus();
1014         delete t;
1015     }
1016 }
1017 
1018 /**
1019  * @return a code point IF the string consists of a single one.
1020  * otherwise returns -1.
1021  * @param string to test
1022  */
getSingleCP(const UnicodeString & s)1023 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1024     //if (s.length() < 1) {
1025     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1026     //}
1027     if (s.length() > 2) return -1;
1028     if (s.length() == 1) return s.charAt(0);
1029 
1030     // at this point, len = 2
1031     UChar32 cp = s.char32At(0);
1032     if (cp > 0xFFFF) { // is surrogate pair
1033         return cp;
1034     }
1035     return -1;
1036 }
1037 
1038 /**
1039  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1040  * If this set already any particular character, it has no effect on that character.
1041  * @param the source string
1042  * @return the modified set, for chaining
1043  */
addAll(const UnicodeString & s)1044 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1045     UChar32 cp;
1046     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1047         cp = s.char32At(i);
1048         add(cp);
1049     }
1050     return *this;
1051 }
1052 
1053 /**
1054  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1055  * If this set already any particular character, it has no effect on that character.
1056  * @param the source string
1057  * @return the modified set, for chaining
1058  */
retainAll(const UnicodeString & s)1059 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1060     UnicodeSet set;
1061     set.addAll(s);
1062     retainAll(set);
1063     return *this;
1064 }
1065 
1066 /**
1067  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1068  * If this set already any particular character, it has no effect on that character.
1069  * @param the source string
1070  * @return the modified set, for chaining
1071  */
complementAll(const UnicodeString & s)1072 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1073     UnicodeSet set;
1074     set.addAll(s);
1075     complementAll(set);
1076     return *this;
1077 }
1078 
1079 /**
1080  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1081  * If this set already any particular character, it has no effect on that character.
1082  * @param the source string
1083  * @return the modified set, for chaining
1084  */
removeAll(const UnicodeString & s)1085 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1086     UnicodeSet set;
1087     set.addAll(s);
1088     removeAll(set);
1089     return *this;
1090 }
1091 
removeAllStrings()1092 UnicodeSet& UnicodeSet::removeAllStrings() {
1093     if (!isFrozen() && hasStrings()) {
1094         strings->removeAllElements();
1095         releasePattern();
1096     }
1097     return *this;
1098 }
1099 
1100 
1101 /**
1102  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1103  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1104  * @param the source string
1105  * @return a newly created set containing the given string
1106  */
createFrom(const UnicodeString & s)1107 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1108     UnicodeSet *set = new UnicodeSet();
1109     if (set != NULL) { // Check for memory allocation error.
1110         set->add(s);
1111     }
1112     return set;
1113 }
1114 
1115 
1116 /**
1117  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1118  * @param the source string
1119  * @return a newly created set containing the given characters
1120  */
createFromAll(const UnicodeString & s)1121 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1122     UnicodeSet *set = new UnicodeSet();
1123     if (set != NULL) { // Check for memory allocation error.
1124         set->addAll(s);
1125     }
1126     return set;
1127 }
1128 
1129 /**
1130  * Retain only the elements in this set that are contained in the
1131  * specified range.  If <code>end > start</code> then an empty range is
1132  * retained, leaving the set empty.
1133  *
1134  * @param start first character, inclusive, of range to be retained
1135  * to this set.
1136  * @param end last character, inclusive, of range to be retained
1137  * to this set.
1138  */
retain(UChar32 start,UChar32 end)1139 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1140     if (pinCodePoint(start) <= pinCodePoint(end)) {
1141         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1142         retain(range, 2, 0);
1143     } else {
1144         clear();
1145     }
1146     return *this;
1147 }
1148 
retain(UChar32 c)1149 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1150     return retain(c, c);
1151 }
1152 
1153 /**
1154  * Removes the specified range from this set if it is present.
1155  * The set will not contain the specified range once the call
1156  * returns.  If <code>end > start</code> then an empty range is
1157  * removed, leaving the set unchanged.
1158  *
1159  * @param start first character, inclusive, of range to be removed
1160  * from this set.
1161  * @param end last character, inclusive, of range to be removed
1162  * from this set.
1163  */
remove(UChar32 start,UChar32 end)1164 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1165     if (pinCodePoint(start) <= pinCodePoint(end)) {
1166         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1167         retain(range, 2, 2);
1168     }
1169     return *this;
1170 }
1171 
1172 /**
1173  * Removes the specified character from this set if it is present.
1174  * The set will not contain the specified range once the call
1175  * returns.
1176  */
remove(UChar32 c)1177 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1178     return remove(c, c);
1179 }
1180 
1181 /**
1182  * Removes the specified string from this set if it is present.
1183  * The set will not contain the specified character once the call
1184  * returns.
1185  * @param the source string
1186  * @return the modified set, for chaining
1187  */
remove(const UnicodeString & s)1188 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1189     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1190     int32_t cp = getSingleCP(s);
1191     if (cp < 0) {
1192         if (strings != nullptr && strings->removeElement((void*) &s)) {
1193             releasePattern();
1194         }
1195     } else {
1196         remove((UChar32)cp, (UChar32)cp);
1197     }
1198     return *this;
1199 }
1200 
1201 /**
1202  * Complements the specified range in this set.  Any character in
1203  * the range will be removed if it is in this set, or will be
1204  * added if it is not in this set.  If <code>end > start</code>
1205  * then an empty range is xor'ed, leaving the set unchanged.
1206  *
1207  * @param start first character, inclusive, of range to be removed
1208  * from this set.
1209  * @param end last character, inclusive, of range to be removed
1210  * from this set.
1211  */
complement(UChar32 start,UChar32 end)1212 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1213     if (isFrozen() || isBogus()) {
1214         return *this;
1215     }
1216     if (pinCodePoint(start) <= pinCodePoint(end)) {
1217         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1218         exclusiveOr(range, 2, 0);
1219     }
1220     releasePattern();
1221     return *this;
1222 }
1223 
complement(UChar32 c)1224 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1225     return complement(c, c);
1226 }
1227 
1228 /**
1229  * This is equivalent to
1230  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1231  */
complement(void)1232 UnicodeSet& UnicodeSet::complement(void) {
1233     if (isFrozen() || isBogus()) {
1234         return *this;
1235     }
1236     if (list[0] == UNICODESET_LOW) {
1237         uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1238         --len;
1239     } else {
1240         if (!ensureCapacity(len+1)) {
1241             return *this;
1242         }
1243         uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1244         list[0] = UNICODESET_LOW;
1245         ++len;
1246     }
1247     releasePattern();
1248     return *this;
1249 }
1250 
1251 /**
1252  * Complement the specified string in this set.
1253  * The set will not contain the specified string once the call
1254  * returns.
1255  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1256  * @param s the string to complement
1257  * @return this object, for chaining
1258  */
complement(const UnicodeString & s)1259 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1260     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1261     int32_t cp = getSingleCP(s);
1262     if (cp < 0) {
1263         if (stringsContains(s)) {
1264             strings->removeElement((void*) &s);
1265         } else {
1266             _add(s);
1267         }
1268         releasePattern();
1269     } else {
1270         complement((UChar32)cp, (UChar32)cp);
1271     }
1272     return *this;
1273 }
1274 
1275 /**
1276  * Adds all of the elements in the specified set to this set if
1277  * they're not already present.  This operation effectively
1278  * modifies this set so that its value is the <i>union</i> of the two
1279  * sets.  The behavior of this operation is unspecified if the specified
1280  * collection is modified while the operation is in progress.
1281  *
1282  * @param c set whose elements are to be added to this set.
1283  * @see #add(char, char)
1284  */
addAll(const UnicodeSet & c)1285 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1286     if ( c.len>0 && c.list!=NULL ) {
1287         add(c.list, c.len, 0);
1288     }
1289 
1290     // Add strings in order
1291     if ( c.strings!=NULL ) {
1292         for (int32_t i=0; i<c.strings->size(); ++i) {
1293             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1294             if (!stringsContains(*s)) {
1295                 _add(*s);
1296             }
1297         }
1298     }
1299     return *this;
1300 }
1301 
1302 /**
1303  * Retains only the elements in this set that are contained in the
1304  * specified set.  In other words, removes from this set all of
1305  * its elements that are not contained in the specified set.  This
1306  * operation effectively modifies this set so that its value is
1307  * the <i>intersection</i> of the two sets.
1308  *
1309  * @param c set that defines which elements this set will retain.
1310  */
retainAll(const UnicodeSet & c)1311 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1312     if (isFrozen() || isBogus()) {
1313         return *this;
1314     }
1315     retain(c.list, c.len, 0);
1316     if (hasStrings()) {
1317         if (!c.hasStrings()) {
1318             strings->removeAllElements();
1319         } else {
1320             strings->retainAll(*c.strings);
1321         }
1322     }
1323     return *this;
1324 }
1325 
1326 /**
1327  * Removes from this set all of its elements that are contained in the
1328  * specified set.  This operation effectively modifies this
1329  * set so that its value is the <i>asymmetric set difference</i> of
1330  * the two sets.
1331  *
1332  * @param c set that defines which elements will be removed from
1333  *          this set.
1334  */
removeAll(const UnicodeSet & c)1335 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1336     if (isFrozen() || isBogus()) {
1337         return *this;
1338     }
1339     retain(c.list, c.len, 2);
1340     if (hasStrings() && c.hasStrings()) {
1341         strings->removeAll(*c.strings);
1342     }
1343     return *this;
1344 }
1345 
1346 /**
1347  * Complements in this set all elements contained in the specified
1348  * set.  Any character in the other set will be removed if it is
1349  * in this set, or will be added if it is not in this set.
1350  *
1351  * @param c set that defines which elements will be xor'ed from
1352  *          this set.
1353  */
complementAll(const UnicodeSet & c)1354 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1355     if (isFrozen() || isBogus()) {
1356         return *this;
1357     }
1358     exclusiveOr(c.list, c.len, 0);
1359 
1360     if (c.strings != nullptr) {
1361         for (int32_t i=0; i<c.strings->size(); ++i) {
1362             void* e = c.strings->elementAt(i);
1363             if (strings == nullptr || !strings->removeElement(e)) {
1364                 _add(*(const UnicodeString*)e);
1365             }
1366         }
1367     }
1368     return *this;
1369 }
1370 
1371 /**
1372  * Removes all of the elements from this set.  This set will be
1373  * empty after this call returns.
1374  */
clear(void)1375 UnicodeSet& UnicodeSet::clear(void) {
1376     if (isFrozen()) {
1377         return *this;
1378     }
1379     list[0] = UNICODESET_HIGH;
1380     len = 1;
1381     releasePattern();
1382     if (strings != NULL) {
1383         strings->removeAllElements();
1384     }
1385     // Remove bogus
1386     fFlags = 0;
1387     return *this;
1388 }
1389 
1390 /**
1391  * Iteration method that returns the number of ranges contained in
1392  * this set.
1393  * @see #getRangeStart
1394  * @see #getRangeEnd
1395  */
getRangeCount() const1396 int32_t UnicodeSet::getRangeCount() const {
1397     return len/2;
1398 }
1399 
1400 /**
1401  * Iteration method that returns the first character in the
1402  * specified range of this set.
1403  * @see #getRangeCount
1404  * @see #getRangeEnd
1405  */
getRangeStart(int32_t index) const1406 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1407     return list[index*2];
1408 }
1409 
1410 /**
1411  * Iteration method that returns the last character in the
1412  * specified range of this set.
1413  * @see #getRangeStart
1414  * @see #getRangeEnd
1415  */
getRangeEnd(int32_t index) const1416 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1417     return list[index*2 + 1] - 1;
1418 }
1419 
getString(int32_t index) const1420 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1421     return (const UnicodeString*) strings->elementAt(index);
1422 }
1423 
1424 /**
1425  * Reallocate this objects internal structures to take up the least
1426  * possible space, without changing this object's value.
1427  */
compact()1428 UnicodeSet& UnicodeSet::compact() {
1429     if (isFrozen() || isBogus()) {
1430         return *this;
1431     }
1432     // Delete buffer first to defragment memory less.
1433     if (buffer != stackList) {
1434         uprv_free(buffer);
1435         buffer = NULL;
1436         bufferCapacity = 0;
1437     }
1438     if (list == stackList) {
1439         // pass
1440     } else if (len <= INITIAL_CAPACITY) {
1441         uprv_memcpy(stackList, list, len * sizeof(UChar32));
1442         uprv_free(list);
1443         list = stackList;
1444         capacity = INITIAL_CAPACITY;
1445     } else if ((len + 7) < capacity) {
1446         // If we have more than a little unused capacity, shrink it to len.
1447         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1448         if (temp) {
1449             list = temp;
1450             capacity = len;
1451         }
1452         // else what the heck happened?! We allocated less memory!
1453         // Oh well. We'll keep our original array.
1454     }
1455     if (strings != nullptr && strings->isEmpty()) {
1456         delete strings;
1457         strings = nullptr;
1458     }
1459     return *this;
1460 }
1461 
1462 #ifdef DEBUG_SERIALIZE
1463 #include <stdio.h>
1464 #endif
1465 
1466 /**
1467  * Deserialize constructor.
1468  */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1469 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1470                        UErrorCode &ec) {
1471 
1472   if(U_FAILURE(ec)) {
1473     setToBogus();
1474     return;
1475   }
1476 
1477   if( (serialization != kSerialized)
1478       || (data==NULL)
1479       || (dataLen < 1)) {
1480     ec = U_ILLEGAL_ARGUMENT_ERROR;
1481     setToBogus();
1482     return;
1483   }
1484 
1485   // bmp?
1486   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1487   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1488 
1489   int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1490 #ifdef DEBUG_SERIALIZE
1491   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1492 #endif
1493   if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
1494     return;
1495   }
1496   // copy bmp
1497   int32_t i;
1498   for(i = 0; i< bmpLength;i++) {
1499     list[i] = data[i+headerSize];
1500 #ifdef DEBUG_SERIALIZE
1501     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1502 #endif
1503   }
1504   // copy smp
1505   for(i=bmpLength;i<newLength;i++) {
1506     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1507               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1508 #ifdef DEBUG_SERIALIZE
1509     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1510 #endif
1511   }
1512   U_ASSERT(i == newLength);
1513   if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1514     list[i++] = UNICODESET_HIGH;
1515   }
1516   len = i;
1517 }
1518 
1519 
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1520 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1521     int32_t bmpLength, length, destLength;
1522 
1523     if (U_FAILURE(ec)) {
1524         return 0;
1525     }
1526 
1527     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1528         ec=U_ILLEGAL_ARGUMENT_ERROR;
1529         return 0;
1530     }
1531 
1532     /* count necessary 16-bit units */
1533     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1534     // assert(length>=0);
1535     if (length==0) {
1536         /* empty set */
1537         if (destCapacity>0) {
1538             *dest=0;
1539         } else {
1540             ec=U_BUFFER_OVERFLOW_ERROR;
1541         }
1542         return 1;
1543     }
1544     /* now length>0 */
1545 
1546     if (this->list[length-1]<=0xffff) {
1547         /* all BMP */
1548         bmpLength=length;
1549     } else if (this->list[0]>=0x10000) {
1550         /* all supplementary */
1551         bmpLength=0;
1552         length*=2;
1553     } else {
1554         /* some BMP, some supplementary */
1555         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1556         length=bmpLength+2*(length-bmpLength);
1557     }
1558 #ifdef DEBUG_SERIALIZE
1559     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1560 #endif
1561     /* length: number of 16-bit array units */
1562     if (length>0x7fff) {
1563         /* there are only 15 bits for the length in the first serialized word */
1564         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1565         return 0;
1566     }
1567 
1568     /*
1569      * total serialized length:
1570      * number of 16-bit array units (length) +
1571      * 1 length unit (always) +
1572      * 1 bmpLength unit (if there are supplementary values)
1573      */
1574     destLength=length+((length>bmpLength)?2:1);
1575     if (destLength<=destCapacity) {
1576         const UChar32 *p;
1577         int32_t i;
1578 
1579 #ifdef DEBUG_SERIALIZE
1580         printf("writeHdr\n");
1581 #endif
1582         *dest=(uint16_t)length;
1583         if (length>bmpLength) {
1584             *dest|=0x8000;
1585             *++dest=(uint16_t)bmpLength;
1586         }
1587         ++dest;
1588 
1589         /* write the BMP part of the array */
1590         p=this->list;
1591         for (i=0; i<bmpLength; ++i) {
1592 #ifdef DEBUG_SERIALIZE
1593           printf("writebmp: %x\n", (int)*p);
1594 #endif
1595             *dest++=(uint16_t)*p++;
1596         }
1597 
1598         /* write the supplementary part of the array */
1599         for (; i<length; i+=2) {
1600 #ifdef DEBUG_SERIALIZE
1601           printf("write32: %x\n", (int)*p);
1602 #endif
1603             *dest++=(uint16_t)(*p>>16);
1604             *dest++=(uint16_t)*p++;
1605         }
1606     } else {
1607         ec=U_BUFFER_OVERFLOW_ERROR;
1608     }
1609     return destLength;
1610 }
1611 
1612 //----------------------------------------------------------------
1613 // Implementation: Utility methods
1614 //----------------------------------------------------------------
1615 
1616 /**
1617  * Allocate our strings vector and return TRUE if successful.
1618  */
allocateStrings(UErrorCode & status)1619 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1620     if (U_FAILURE(status)) {
1621         return FALSE;
1622     }
1623     strings = new UVector(uprv_deleteUObject,
1624                           uhash_compareUnicodeString, 1, status);
1625     if (strings == NULL) { // Check for memory allocation error.
1626         status = U_MEMORY_ALLOCATION_ERROR;
1627         return FALSE;
1628     }
1629     if (U_FAILURE(status)) {
1630         delete strings;
1631         strings = NULL;
1632         return FALSE;
1633     }
1634     return TRUE;
1635 }
1636 
nextCapacity(int32_t minCapacity)1637 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1638     // Grow exponentially to reduce the frequency of allocations.
1639     if (minCapacity < INITIAL_CAPACITY) {
1640         return minCapacity + INITIAL_CAPACITY;
1641     } else if (minCapacity <= 2500) {
1642         return 5 * minCapacity;
1643     } else {
1644         int32_t newCapacity = 2 * minCapacity;
1645         if (newCapacity > MAX_LENGTH) {
1646             newCapacity = MAX_LENGTH;
1647         }
1648         return newCapacity;
1649     }
1650 }
1651 
ensureCapacity(int32_t newLen)1652 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1653     if (newLen > MAX_LENGTH) {
1654         newLen = MAX_LENGTH;
1655     }
1656     if (newLen <= capacity) {
1657         return true;
1658     }
1659     int32_t newCapacity = nextCapacity(newLen);
1660     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1661     if (temp == NULL) {
1662         setToBogus(); // set the object to bogus state if an OOM failure occurred.
1663         return false;
1664     }
1665     // Copy only the actual contents.
1666     uprv_memcpy(temp, list, len * sizeof(UChar32));
1667     if (list != stackList) {
1668         uprv_free(list);
1669     }
1670     list = temp;
1671     capacity = newCapacity;
1672     return true;
1673 }
1674 
ensureBufferCapacity(int32_t newLen)1675 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1676     if (newLen > MAX_LENGTH) {
1677         newLen = MAX_LENGTH;
1678     }
1679     if (newLen <= bufferCapacity) {
1680         return true;
1681     }
1682     int32_t newCapacity = nextCapacity(newLen);
1683     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1684     if (temp == NULL) {
1685         setToBogus();
1686         return false;
1687     }
1688     // The buffer has no contents to be copied.
1689     // It is always filled from scratch after this call.
1690     if (buffer != stackList) {
1691         uprv_free(buffer);
1692     }
1693     buffer = temp;
1694     bufferCapacity = newCapacity;
1695     return true;
1696 }
1697 
1698 /**
1699  * Swap list and buffer.
1700  */
swapBuffers(void)1701 void UnicodeSet::swapBuffers(void) {
1702     // swap list and buffer
1703     UChar32* temp = list;
1704     list = buffer;
1705     buffer = temp;
1706 
1707     int32_t c = capacity;
1708     capacity = bufferCapacity;
1709     bufferCapacity = c;
1710 }
1711 
setToBogus()1712 void UnicodeSet::setToBogus() {
1713     clear(); // Remove everything in the set.
1714     fFlags = kIsBogus;
1715 }
1716 
1717 //----------------------------------------------------------------
1718 // Implementation: Fundamental operators
1719 //----------------------------------------------------------------
1720 
max(UChar32 a,UChar32 b)1721 static inline UChar32 max(UChar32 a, UChar32 b) {
1722     return (a > b) ? a : b;
1723 }
1724 
1725 // polarity = 0, 3 is normal: x xor y
1726 // polarity = 1, 2: x xor ~y == x === y
1727 
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1728 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1729     if (isFrozen() || isBogus()) {
1730         return;
1731     }
1732     if (!ensureBufferCapacity(len + otherLen)) {
1733         return;
1734     }
1735 
1736     int32_t i = 0, j = 0, k = 0;
1737     UChar32 a = list[i++];
1738     UChar32 b;
1739     if (polarity == 1 || polarity == 2) {
1740         b = UNICODESET_LOW;
1741         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1742             ++j;
1743             b = other[j];
1744         }
1745     } else {
1746         b = other[j++];
1747     }
1748     // simplest of all the routines
1749     // sort the values, discarding identicals!
1750     for (;;) {
1751         if (a < b) {
1752             buffer[k++] = a;
1753             a = list[i++];
1754         } else if (b < a) {
1755             buffer[k++] = b;
1756             b = other[j++];
1757         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1758             // discard both values!
1759             a = list[i++];
1760             b = other[j++];
1761         } else { // DONE!
1762             buffer[k++] = UNICODESET_HIGH;
1763             len = k;
1764             break;
1765         }
1766     }
1767     swapBuffers();
1768     releasePattern();
1769 }
1770 
1771 // polarity = 0 is normal: x union y
1772 // polarity = 2: x union ~y
1773 // polarity = 1: ~x union y
1774 // polarity = 3: ~x union ~y
1775 
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1776 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1777     if (isFrozen() || isBogus() || other==NULL) {
1778         return;
1779     }
1780     if (!ensureBufferCapacity(len + otherLen)) {
1781         return;
1782     }
1783 
1784     int32_t i = 0, j = 0, k = 0;
1785     UChar32 a = list[i++];
1786     UChar32 b = other[j++];
1787     // change from xor is that we have to check overlapping pairs
1788     // polarity bit 1 means a is second, bit 2 means b is.
1789     for (;;) {
1790         switch (polarity) {
1791           case 0: // both first; take lower if unequal
1792             if (a < b) { // take a
1793                 // Back up over overlapping ranges in buffer[]
1794                 if (k > 0 && a <= buffer[k-1]) {
1795                     // Pick latter end value in buffer[] vs. list[]
1796                     a = max(list[i], buffer[--k]);
1797                 } else {
1798                     // No overlap
1799                     buffer[k++] = a;
1800                     a = list[i];
1801                 }
1802                 i++; // Common if/else code factored out
1803                 polarity ^= 1;
1804             } else if (b < a) { // take b
1805                 if (k > 0 && b <= buffer[k-1]) {
1806                     b = max(other[j], buffer[--k]);
1807                 } else {
1808                     buffer[k++] = b;
1809                     b = other[j];
1810                 }
1811                 j++;
1812                 polarity ^= 2;
1813             } else { // a == b, take a, drop b
1814                 if (a == UNICODESET_HIGH) goto loop_end;
1815                 // This is symmetrical; it doesn't matter if
1816                 // we backtrack with a or b. - liu
1817                 if (k > 0 && a <= buffer[k-1]) {
1818                     a = max(list[i], buffer[--k]);
1819                 } else {
1820                     // No overlap
1821                     buffer[k++] = a;
1822                     a = list[i];
1823                 }
1824                 i++;
1825                 polarity ^= 1;
1826                 b = other[j++];
1827                 polarity ^= 2;
1828             }
1829             break;
1830           case 3: // both second; take higher if unequal, and drop other
1831             if (b <= a) { // take a
1832                 if (a == UNICODESET_HIGH) goto loop_end;
1833                 buffer[k++] = a;
1834             } else { // take b
1835                 if (b == UNICODESET_HIGH) goto loop_end;
1836                 buffer[k++] = b;
1837             }
1838             a = list[i++];
1839             polarity ^= 1;   // factored common code
1840             b = other[j++];
1841             polarity ^= 2;
1842             break;
1843           case 1: // a second, b first; if b < a, overlap
1844             if (a < b) { // no overlap, take a
1845                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1846             } else if (b < a) { // OVERLAP, drop b
1847                 b = other[j++];
1848                 polarity ^= 2;
1849             } else { // a == b, drop both!
1850                 if (a == UNICODESET_HIGH) goto loop_end;
1851                 a = list[i++];
1852                 polarity ^= 1;
1853                 b = other[j++];
1854                 polarity ^= 2;
1855             }
1856             break;
1857           case 2: // a first, b second; if a < b, overlap
1858             if (b < a) { // no overlap, take b
1859                 buffer[k++] = b;
1860                 b = other[j++];
1861                 polarity ^= 2;
1862             } else  if (a < b) { // OVERLAP, drop a
1863                 a = list[i++];
1864                 polarity ^= 1;
1865             } else { // a == b, drop both!
1866                 if (a == UNICODESET_HIGH) goto loop_end;
1867                 a = list[i++];
1868                 polarity ^= 1;
1869                 b = other[j++];
1870                 polarity ^= 2;
1871             }
1872             break;
1873         }
1874     }
1875  loop_end:
1876     buffer[k++] = UNICODESET_HIGH;    // terminate
1877     len = k;
1878     swapBuffers();
1879     releasePattern();
1880 }
1881 
1882 // polarity = 0 is normal: x intersect y
1883 // polarity = 2: x intersect ~y == set-minus
1884 // polarity = 1: ~x intersect y
1885 // polarity = 3: ~x intersect ~y
1886 
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1887 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1888     if (isFrozen() || isBogus()) {
1889         return;
1890     }
1891     if (!ensureBufferCapacity(len + otherLen)) {
1892         return;
1893     }
1894 
1895     int32_t i = 0, j = 0, k = 0;
1896     UChar32 a = list[i++];
1897     UChar32 b = other[j++];
1898     // change from xor is that we have to check overlapping pairs
1899     // polarity bit 1 means a is second, bit 2 means b is.
1900     for (;;) {
1901         switch (polarity) {
1902           case 0: // both first; drop the smaller
1903             if (a < b) { // drop a
1904                 a = list[i++];
1905                 polarity ^= 1;
1906             } else if (b < a) { // drop b
1907                 b = other[j++];
1908                 polarity ^= 2;
1909             } else { // a == b, take one, drop other
1910                 if (a == UNICODESET_HIGH) goto loop_end;
1911                 buffer[k++] = a;
1912                 a = list[i++];
1913                 polarity ^= 1;
1914                 b = other[j++];
1915                 polarity ^= 2;
1916             }
1917             break;
1918           case 3: // both second; take lower if unequal
1919             if (a < b) { // take a
1920                 buffer[k++] = a;
1921                 a = list[i++];
1922                 polarity ^= 1;
1923             } else if (b < a) { // take b
1924                 buffer[k++] = b;
1925                 b = other[j++];
1926                 polarity ^= 2;
1927             } else { // a == b, take one, drop other
1928                 if (a == UNICODESET_HIGH) goto loop_end;
1929                 buffer[k++] = a;
1930                 a = list[i++];
1931                 polarity ^= 1;
1932                 b = other[j++];
1933                 polarity ^= 2;
1934             }
1935             break;
1936           case 1: // a second, b first;
1937             if (a < b) { // NO OVERLAP, drop a
1938                 a = list[i++];
1939                 polarity ^= 1;
1940             } else if (b < a) { // OVERLAP, take b
1941                 buffer[k++] = b;
1942                 b = other[j++];
1943                 polarity ^= 2;
1944             } else { // a == b, drop both!
1945                 if (a == UNICODESET_HIGH) goto loop_end;
1946                 a = list[i++];
1947                 polarity ^= 1;
1948                 b = other[j++];
1949                 polarity ^= 2;
1950             }
1951             break;
1952           case 2: // a first, b second; if a < b, overlap
1953             if (b < a) { // no overlap, drop b
1954                 b = other[j++];
1955                 polarity ^= 2;
1956             } else  if (a < b) { // OVERLAP, take a
1957                 buffer[k++] = a;
1958                 a = list[i++];
1959                 polarity ^= 1;
1960             } else { // a == b, drop both!
1961                 if (a == UNICODESET_HIGH) goto loop_end;
1962                 a = list[i++];
1963                 polarity ^= 1;
1964                 b = other[j++];
1965                 polarity ^= 2;
1966             }
1967             break;
1968         }
1969     }
1970  loop_end:
1971     buffer[k++] = UNICODESET_HIGH;    // terminate
1972     len = k;
1973     swapBuffers();
1974     releasePattern();
1975 }
1976 
1977 /**
1978  * Append the <code>toPattern()</code> representation of a
1979  * string to the given <code>StringBuffer</code>.
1980  */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1981 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1982 escapeUnprintable) {
1983     UChar32 cp;
1984     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1985         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1986     }
1987 }
1988 
1989 /**
1990  * Append the <code>toPattern()</code> representation of a
1991  * character to the given <code>StringBuffer</code>.
1992  */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1993 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1994 escapeUnprintable) {
1995     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1996         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1997         // unprintable
1998         if (ICU_Utility::escapeUnprintable(buf, c)) {
1999             return;
2000         }
2001     }
2002     // Okay to let ':' pass through
2003     switch (c) {
2004     case SET_OPEN:
2005     case SET_CLOSE:
2006     case HYPHEN:
2007     case COMPLEMENT:
2008     case INTERSECTION:
2009     case BACKSLASH:
2010     case OPEN_BRACE:
2011     case CLOSE_BRACE:
2012     case COLON:
2013     case SymbolTable::SYMBOL_REF:
2014         buf.append(BACKSLASH);
2015         break;
2016     default:
2017         // Escape whitespace
2018         if (PatternProps::isWhiteSpace(c)) {
2019             buf.append(BACKSLASH);
2020         }
2021         break;
2022     }
2023     buf.append(c);
2024 }
2025 
2026 /**
2027  * Append a string representation of this set to result.  This will be
2028  * a cleaned version of the string passed to applyPattern(), if there
2029  * is one.  Otherwise it will be generated.
2030  */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2031 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2032                                       UBool escapeUnprintable) const
2033 {
2034     if (pat != NULL) {
2035         int32_t i;
2036         int32_t backslashCount = 0;
2037         for (i=0; i<patLen; ) {
2038             UChar32 c;
2039             U16_NEXT(pat, i, patLen, c);
2040             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2041                 // If the unprintable character is preceded by an odd
2042                 // number of backslashes, then it has been escaped.
2043                 // Before unescaping it, we delete the final
2044                 // backslash.
2045                 if ((backslashCount % 2) == 1) {
2046                     result.truncate(result.length() - 1);
2047                 }
2048                 ICU_Utility::escapeUnprintable(result, c);
2049                 backslashCount = 0;
2050             } else {
2051                 result.append(c);
2052                 if (c == BACKSLASH) {
2053                     ++backslashCount;
2054                 } else {
2055                     backslashCount = 0;
2056                 }
2057             }
2058         }
2059         return result;
2060     }
2061 
2062     return _generatePattern(result, escapeUnprintable);
2063 }
2064 
2065 /**
2066  * Returns a string representation of this set.  If the result of
2067  * calling this function is passed to a UnicodeSet constructor, it
2068  * will produce another set that is equal to this one.
2069  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2070 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071                                      UBool escapeUnprintable) const
2072 {
2073     result.truncate(0);
2074     return _toPattern(result, escapeUnprintable);
2075 }
2076 
2077 /**
2078  * Generate and append a string representation of this set to result.
2079  * This does not use this.pat, the cleaned up copy of the string
2080  * passed to applyPattern().
2081  */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2082 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083                                             UBool escapeUnprintable) const
2084 {
2085     result.append(SET_OPEN);
2086 
2087 //  // Check against the predefined categories.  We implicitly build
2088 //  // up ALL category sets the first time toPattern() is called.
2089 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2090 //      if (*this == getCategorySet(cat)) {
2091 //          result.append(COLON);
2092 //          result.append(CATEGORY_NAMES, cat*2, 2);
2093 //          return result.append(CATEGORY_CLOSE);
2094 //      }
2095 //  }
2096 
2097     int32_t count = getRangeCount();
2098 
2099     // If the set contains at least 2 intervals and includes both
2100     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2101     // be more economical.
2102     if (count > 1 &&
2103         getRangeStart(0) == MIN_VALUE &&
2104         getRangeEnd(count-1) == MAX_VALUE) {
2105 
2106         // Emit the inverse
2107         result.append(COMPLEMENT);
2108 
2109         for (int32_t i = 1; i < count; ++i) {
2110             UChar32 start = getRangeEnd(i-1)+1;
2111             UChar32 end = getRangeStart(i)-1;
2112             _appendToPat(result, start, escapeUnprintable);
2113             if (start != end) {
2114                 if ((start+1) != end) {
2115                     result.append(HYPHEN);
2116                 }
2117                 _appendToPat(result, end, escapeUnprintable);
2118             }
2119         }
2120     }
2121 
2122     // Default; emit the ranges as pairs
2123     else {
2124         for (int32_t i = 0; i < count; ++i) {
2125             UChar32 start = getRangeStart(i);
2126             UChar32 end = getRangeEnd(i);
2127             _appendToPat(result, start, escapeUnprintable);
2128             if (start != end) {
2129                 if ((start+1) != end) {
2130                     result.append(HYPHEN);
2131                 }
2132                 _appendToPat(result, end, escapeUnprintable);
2133             }
2134         }
2135     }
2136 
2137     if (strings != nullptr) {
2138         for (int32_t i = 0; i<strings->size(); ++i) {
2139             result.append(OPEN_BRACE);
2140             _appendToPat(result,
2141                          *(const UnicodeString*) strings->elementAt(i),
2142                          escapeUnprintable);
2143             result.append(CLOSE_BRACE);
2144         }
2145     }
2146     return result.append(SET_CLOSE);
2147 }
2148 
2149 /**
2150 * Release existing cached pattern
2151 */
releasePattern()2152 void UnicodeSet::releasePattern() {
2153     if (pat) {
2154         uprv_free(pat);
2155         pat = NULL;
2156         patLen = 0;
2157     }
2158 }
2159 
2160 /**
2161 * Set the new pattern to cache.
2162 */
setPattern(const char16_t * newPat,int32_t newPatLen)2163 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2164     releasePattern();
2165     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2166     if (pat) {
2167         patLen = newPatLen;
2168         u_memcpy(pat, newPat, patLen);
2169         pat[patLen] = 0;
2170     }
2171     // else we don't care if malloc failed. This was just a nice cache.
2172     // We can regenerate an equivalent pattern later when requested.
2173 }
2174 
freeze()2175 UnicodeSet *UnicodeSet::freeze() {
2176     if(!isFrozen() && !isBogus()) {
2177         compact();
2178 
2179         // Optimize contains() and span() and similar functions.
2180         if (hasStrings()) {
2181             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2182             if (stringSpan == nullptr) {
2183                 setToBogus();
2184                 return this;
2185             } else if (!stringSpan->needsStringSpanUTF16()) {
2186                 // All strings are irrelevant for span() etc. because
2187                 // all of each string's code points are contained in this set.
2188                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2189                 // many relevant strings as UTF-16.
2190                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2191                 delete stringSpan;
2192                 stringSpan = NULL;
2193             }
2194         }
2195         if (stringSpan == NULL) {
2196             // No span-relevant strings: Optimize for code point spans.
2197             bmpSet=new BMPSet(list, len);
2198             if (bmpSet == NULL) { // Check for memory allocation error.
2199                 setToBogus();
2200             }
2201         }
2202     }
2203     return this;
2204 }
2205 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2206 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2207     if(length>0 && bmpSet!=NULL) {
2208         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2209     }
2210     if(length<0) {
2211         length=u_strlen(s);
2212     }
2213     if(length==0) {
2214         return 0;
2215     }
2216     if(stringSpan!=NULL) {
2217         return stringSpan->span(s, length, spanCondition);
2218     } else if(hasStrings()) {
2219         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2220                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2221                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2222         UnicodeSetStringSpan strSpan(*this, *strings, which);
2223         if(strSpan.needsStringSpanUTF16()) {
2224             return strSpan.span(s, length, spanCondition);
2225         }
2226     }
2227 
2228     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2229         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2230     }
2231 
2232     UChar32 c;
2233     int32_t start=0, prev=0;
2234     do {
2235         U16_NEXT(s, start, length, c);
2236         if(spanCondition!=contains(c)) {
2237             break;
2238         }
2239     } while((prev=start)<length);
2240     return prev;
2241 }
2242 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2243 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2244     if(length>0 && bmpSet!=NULL) {
2245         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2246     }
2247     if(length<0) {
2248         length=u_strlen(s);
2249     }
2250     if(length==0) {
2251         return 0;
2252     }
2253     if(stringSpan!=NULL) {
2254         return stringSpan->spanBack(s, length, spanCondition);
2255     } else if(hasStrings()) {
2256         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2257                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2258                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2259         UnicodeSetStringSpan strSpan(*this, *strings, which);
2260         if(strSpan.needsStringSpanUTF16()) {
2261             return strSpan.spanBack(s, length, spanCondition);
2262         }
2263     }
2264 
2265     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2266         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2267     }
2268 
2269     UChar32 c;
2270     int32_t prev=length;
2271     do {
2272         U16_PREV(s, 0, length, c);
2273         if(spanCondition!=contains(c)) {
2274             break;
2275         }
2276     } while((prev=length)>0);
2277     return prev;
2278 }
2279 
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2280 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2281     if(length>0 && bmpSet!=NULL) {
2282         const uint8_t *s0=(const uint8_t *)s;
2283         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2284     }
2285     if(length<0) {
2286         length=(int32_t)uprv_strlen(s);
2287     }
2288     if(length==0) {
2289         return 0;
2290     }
2291     if(stringSpan!=NULL) {
2292         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2293     } else if(hasStrings()) {
2294         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2295                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2296                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2297         UnicodeSetStringSpan strSpan(*this, *strings, which);
2298         if(strSpan.needsStringSpanUTF8()) {
2299             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2300         }
2301     }
2302 
2303     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2304         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2305     }
2306 
2307     UChar32 c;
2308     int32_t start=0, prev=0;
2309     do {
2310         U8_NEXT_OR_FFFD(s, start, length, c);
2311         if(spanCondition!=contains(c)) {
2312             break;
2313         }
2314     } while((prev=start)<length);
2315     return prev;
2316 }
2317 
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2318 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2319     if(length>0 && bmpSet!=NULL) {
2320         const uint8_t *s0=(const uint8_t *)s;
2321         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2322     }
2323     if(length<0) {
2324         length=(int32_t)uprv_strlen(s);
2325     }
2326     if(length==0) {
2327         return 0;
2328     }
2329     if(stringSpan!=NULL) {
2330         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2331     } else if(hasStrings()) {
2332         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2333                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2334                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2335         UnicodeSetStringSpan strSpan(*this, *strings, which);
2336         if(strSpan.needsStringSpanUTF8()) {
2337             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2338         }
2339     }
2340 
2341     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2342         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2343     }
2344 
2345     UChar32 c;
2346     int32_t prev=length;
2347     do {
2348         U8_PREV_OR_FFFD(s, 0, length, c);
2349         if(spanCondition!=contains(c)) {
2350             break;
2351         }
2352     } while((prev=length)>0);
2353     return prev;
2354 }
2355 
2356 U_NAMESPACE_END
2357