1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uniset_props.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug25
16 *   created by: Markus W. Scherer
17 *
18 *   Character property dependent functions moved here from uniset.cpp
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/uniset.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uscript.h"
26 #include "unicode/symtable.h"
27 #include "unicode/uset.h"
28 #include "unicode/locid.h"
29 #include "unicode/brkiter.h"
30 #include "uset_imp.h"
31 #include "ruleiter.h"
32 #include "cmemory.h"
33 #include "ucln_cmn.h"
34 #include "util.h"
35 #include "uvector.h"
36 #include "uprops.h"
37 #include "propname.h"
38 #include "normalizer2impl.h"
39 #include "uinvchar.h"
40 #include "uprops.h"
41 #include "charstr.h"
42 #include "cstring.h"
43 #include "mutex.h"
44 #include "umutex.h"
45 #include "uassert.h"
46 #include "hash.h"
47 
48 U_NAMESPACE_USE
49 
// Special property set IDs, each followed by the set it stands for
static const char ANY[]      = "ANY";       // [\u0000-\U0010FFFF]
static const char ASCII[]    = "ASCII";     // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned";  // [:^Cn:]

// Alias of the Unicode name property, and the length of that alias
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
58 
59 // Cached sets ------------------------------------------------------------- ***
60 
61 U_CDECL_BEGIN
62 static UBool U_CALLCONV uset_cleanup();
63 
64 static UnicodeSet *uni32Singleton;
65 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
66 
67 /**
68  * Cleanup function for UnicodeSet
69  */
uset_cleanup(void)70 static UBool U_CALLCONV uset_cleanup(void) {
71     delete uni32Singleton;
72     uni32Singleton = NULL;
73     uni32InitOnce.reset();
74     return TRUE;
75 }
76 
77 U_CDECL_END
78 
79 U_NAMESPACE_BEGIN
80 
81 namespace {
82 
83 // Cache some sets for other services -------------------------------------- ***
createUni32Set(UErrorCode & errorCode)84 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85     U_ASSERT(uni32Singleton == NULL);
86     uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
87     if(uni32Singleton==NULL) {
88         errorCode=U_MEMORY_ALLOCATION_ERROR;
89     } else {
90         uni32Singleton->freeze();
91     }
92     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
93 }
94 
95 
96 U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode & errorCode)97 uniset_getUnicode32Instance(UErrorCode &errorCode) {
98     umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99     return uni32Singleton;
100 }
101 
102 // helper functions for matching of pattern syntax pieces ------------------ ***
103 // these functions are parallel to the PERL_OPEN etc. strings above
104 
105 // using these functions is not only faster than UnicodeString::compare() and
106 // caseCompare(), but they also make UnicodeSet work for simple patterns when
107 // no Unicode properties data is available - when caseCompare() fails
108 
109 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)110 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111     UChar c;
112     return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
113 }
114 
115 /*static inline UBool
116 isPerlClose(const UnicodeString &pattern, int32_t pos) {
117     return pattern.charAt(pos)==u'}';
118 }*/
119 
120 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)121 isNameOpen(const UnicodeString &pattern, int32_t pos) {
122     return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
123 }
124 
125 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)126 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
127     return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
128 }
129 
130 /*static inline UBool
131 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
132     return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
133 }*/
134 
// TODO: uniset.cpp provides memory debugging that could be made available
// here as well, but it is probably obsolete now that modern memory leak
// checker tools exist.  In this file _dbgct() therefore expands to nothing.
#define _dbgct(me)
139 
140 }  // namespace
141 
142 //----------------------------------------------------------------
143 // Constructors &c
144 //----------------------------------------------------------------
145 
146 /**
147  * Constructs a set from the given pattern, optionally ignoring
148  * white space.  See the class description for the syntax of the
149  * pattern language.
150  * @param pattern a string specifying what characters are in the set
151  */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)152 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
153                        UErrorCode& status) {
154     applyPattern(pattern, status);
155     _dbgct(this);
156 }
157 
158 //----------------------------------------------------------------
159 // Public API
160 //----------------------------------------------------------------
161 
applyPattern(const UnicodeString & pattern,UErrorCode & status)162 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163                                      UErrorCode& status) {
164     // Equivalent to
165     //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
166     // but without dependency on closeOver().
167     ParsePosition pos(0);
168     applyPatternIgnoreSpace(pattern, pos, NULL, status);
169     if (U_FAILURE(status)) return *this;
170 
171     int32_t i = pos.getIndex();
172     // Skip over trailing whitespace
173     ICU_Utility::skipWhitespace(pattern, i, TRUE);
174     if (i != pattern.length()) {
175         status = U_ILLEGAL_ARGUMENT_ERROR;
176     }
177     return *this;
178 }
179 
180 void
applyPatternIgnoreSpace(const UnicodeString & pattern,ParsePosition & pos,const SymbolTable * symbols,UErrorCode & status)181 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182                                     ParsePosition& pos,
183                                     const SymbolTable* symbols,
184                                     UErrorCode& status) {
185     if (U_FAILURE(status)) {
186         return;
187     }
188     if (isFrozen()) {
189         status = U_NO_WRITE_PERMISSION;
190         return;
191     }
192     // Need to build the pattern in a temporary string because
193     // _applyPattern calls add() etc., which set pat to empty.
194     UnicodeString rebuiltPat;
195     RuleCharacterIterator chars(pattern, symbols, pos);
196     applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
197     if (U_FAILURE(status)) return;
198     if (chars.inVariable()) {
199         // syntaxError(chars, "Extra chars in variable value");
200         status = U_MALFORMED_SET;
201         return;
202     }
203     setPattern(rebuiltPat);
204 }
205 
206 /**
207  * Return true if the given position, in the given pattern, appears
208  * to be the start of a UnicodeSet pattern.
209  */
resemblesPattern(const UnicodeString & pattern,int32_t pos)210 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211     return ((pos+1) < pattern.length() &&
212             pattern.charAt(pos) == (UChar)91/*[*/) ||
213         resemblesPropertyPattern(pattern, pos);
214 }
215 
216 //----------------------------------------------------------------
217 // Implementation: Pattern parsing
218 //----------------------------------------------------------------
219 
220 namespace {
221 
222 /**
223  * A small all-inline class to manage a UnicodeSet pointer.  Add
224  * operator->() etc. as needed.
225  */
226 class UnicodeSetPointer {
227     UnicodeSet* p;
228 public:
UnicodeSetPointer()229     inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()230     inline ~UnicodeSetPointer() { delete p; }
pointer()231     inline UnicodeSet* pointer() { return p; }
allocate()232     inline UBool allocate() {
233         if (p == 0) {
234             p = new UnicodeSet();
235         }
236         return p != 0;
237     }
238 };
239 
240 constexpr int32_t MAX_DEPTH = 100;
241 
242 }  // namespace
243 
244 /**
245  * Parse the pattern from the given RuleCharacterIterator.  The
246  * iterator is advanced over the parsed pattern.
247  * @param chars iterator over the pattern characters.  Upon return
248  * it will be advanced to the first character after the parsed
249  * pattern, or the end of the iteration if all characters are
250  * parsed.
251  * @param symbols symbol table to use to parse and dereference
252  * variables, or null if none.
253  * @param rebuiltPat the pattern that was parsed, rebuilt or
254  * copied from the input pattern, as appropriate.
255  * @param options a bit mask of zero or more of the following:
256  * IGNORE_SPACE, CASE.
257  */
applyPattern(RuleCharacterIterator & chars,const SymbolTable * symbols,UnicodeString & rebuiltPat,uint32_t options,UnicodeSet & (UnicodeSet::* caseClosure)(int32_t attribute),int32_t depth,UErrorCode & ec)258 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
259                               const SymbolTable* symbols,
260                               UnicodeString& rebuiltPat,
261                               uint32_t options,
262                               UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
263                               int32_t depth,
264                               UErrorCode& ec) {
265     if (U_FAILURE(ec)) return;
266     if (depth > MAX_DEPTH) {
267         ec = U_ILLEGAL_ARGUMENT_ERROR;
268         return;
269     }
270 
271     // Syntax characters: [ ] ^ - & { }
272 
273     // Recognized special forms for chars, sets: c-c s-s s&s
274 
275     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
276                    RuleCharacterIterator::PARSE_ESCAPES;
277     if ((options & USET_IGNORE_SPACE) != 0) {
278         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
279     }
280 
281     UnicodeString patLocal, buf;
282     UBool usePat = FALSE;
283     UnicodeSetPointer scratch;
284     RuleCharacterIterator::Pos backup;
285 
286     // mode: 0=before [, 1=between [...], 2=after ]
287     // lastItem: 0=none, 1=char, 2=set
288     int8_t lastItem = 0, mode = 0;
289     UChar32 lastChar = 0;
290     UChar op = 0;
291 
292     UBool invert = FALSE;
293 
294     clear();
295 
296     while (mode != 2 && !chars.atEnd()) {
297         U_ASSERT((lastItem == 0 && op == 0) ||
298                  (lastItem == 1 && (op == 0 || op == u'-')) ||
299                  (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
300 
301         UChar32 c = 0;
302         UBool literal = FALSE;
303         UnicodeSet* nested = 0; // alias - do not delete
304 
305         // -------- Check for property pattern
306 
307         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
308         int8_t setMode = 0;
309         if (resemblesPropertyPattern(chars, opts)) {
310             setMode = 2;
311         }
312 
313         // -------- Parse '[' of opening delimiter OR nested set.
314         // If there is a nested set, use `setMode' to define how
315         // the set should be parsed.  If the '[' is part of the
316         // opening delimiter for this pattern, parse special
317         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
318         // characters representing a nested set in the symbol
319         // table.
320 
321         else {
322             // Prepare to backup if necessary
323             chars.getPos(backup);
324             c = chars.next(opts, literal, ec);
325             if (U_FAILURE(ec)) return;
326 
327             if (c == u'[' && !literal) {
328                 if (mode == 1) {
329                     chars.setPos(backup); // backup
330                     setMode = 1;
331                 } else {
332                     // Handle opening '[' delimiter
333                     mode = 1;
334                     patLocal.append(u'[');
335                     chars.getPos(backup); // prepare to backup
336                     c = chars.next(opts, literal, ec);
337                     if (U_FAILURE(ec)) return;
338                     if (c == u'^' && !literal) {
339                         invert = TRUE;
340                         patLocal.append(u'^');
341                         chars.getPos(backup); // prepare to backup
342                         c = chars.next(opts, literal, ec);
343                         if (U_FAILURE(ec)) return;
344                     }
345                     // Fall through to handle special leading '-';
346                     // otherwise restart loop for nested [], \p{}, etc.
347                     if (c == u'-') {
348                         literal = TRUE;
349                         // Fall through to handle literal '-' below
350                     } else {
351                         chars.setPos(backup); // backup
352                         continue;
353                     }
354                 }
355             } else if (symbols != 0) {
356                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
357                 if (m != 0) {
358                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
359                     if (ms == NULL) {
360                         ec = U_MALFORMED_SET;
361                         return;
362                     }
363                     // casting away const, but `nested' won't be modified
364                     // (important not to modify stored set)
365                     nested = const_cast<UnicodeSet*>(ms);
366                     setMode = 3;
367                 }
368             }
369         }
370 
371         // -------- Handle a nested set.  This either is inline in
372         // the pattern or represented by a stand-in that has
373         // previously been parsed and was looked up in the symbol
374         // table.
375 
376         if (setMode != 0) {
377             if (lastItem == 1) {
378                 if (op != 0) {
379                     // syntaxError(chars, "Char expected after operator");
380                     ec = U_MALFORMED_SET;
381                     return;
382                 }
383                 add(lastChar, lastChar);
384                 _appendToPat(patLocal, lastChar, FALSE);
385                 lastItem = 0;
386                 op = 0;
387             }
388 
389             if (op == u'-' || op == u'&') {
390                 patLocal.append(op);
391             }
392 
393             if (nested == 0) {
394                 // lazy allocation
395                 if (!scratch.allocate()) {
396                     ec = U_MEMORY_ALLOCATION_ERROR;
397                     return;
398                 }
399                 nested = scratch.pointer();
400             }
401             switch (setMode) {
402             case 1:
403                 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
404                 break;
405             case 2:
406                 chars.skipIgnored(opts);
407                 nested->applyPropertyPattern(chars, patLocal, ec);
408                 if (U_FAILURE(ec)) return;
409                 break;
410             case 3: // `nested' already parsed
411                 nested->_toPattern(patLocal, FALSE);
412                 break;
413             }
414 
415             usePat = TRUE;
416 
417             if (mode == 0) {
418                 // Entire pattern is a category; leave parse loop
419                 *this = *nested;
420                 mode = 2;
421                 break;
422             }
423 
424             switch (op) {
425             case u'-':
426                 removeAll(*nested);
427                 break;
428             case u'&':
429                 retainAll(*nested);
430                 break;
431             case 0:
432                 addAll(*nested);
433                 break;
434             }
435 
436             op = 0;
437             lastItem = 2;
438 
439             continue;
440         }
441 
442         if (mode == 0) {
443             // syntaxError(chars, "Missing '['");
444             ec = U_MALFORMED_SET;
445             return;
446         }
447 
448         // -------- Parse special (syntax) characters.  If the
449         // current character is not special, or if it is escaped,
450         // then fall through and handle it below.
451 
452         if (!literal) {
453             switch (c) {
454             case u']':
455                 if (lastItem == 1) {
456                     add(lastChar, lastChar);
457                     _appendToPat(patLocal, lastChar, FALSE);
458                 }
459                 // Treat final trailing '-' as a literal
460                 if (op == u'-') {
461                     add(op, op);
462                     patLocal.append(op);
463                 } else if (op == u'&') {
464                     // syntaxError(chars, "Trailing '&'");
465                     ec = U_MALFORMED_SET;
466                     return;
467                 }
468                 patLocal.append(u']');
469                 mode = 2;
470                 continue;
471             case u'-':
472                 if (op == 0) {
473                     if (lastItem != 0) {
474                         op = (UChar) c;
475                         continue;
476                     } else {
477                         // Treat final trailing '-' as a literal
478                         add(c, c);
479                         c = chars.next(opts, literal, ec);
480                         if (U_FAILURE(ec)) return;
481                         if (c == u']' && !literal) {
482                             patLocal.append(u"-]", 2);
483                             mode = 2;
484                             continue;
485                         }
486                     }
487                 }
488                 // syntaxError(chars, "'-' not after char or set");
489                 ec = U_MALFORMED_SET;
490                 return;
491             case u'&':
492                 if (lastItem == 2 && op == 0) {
493                     op = (UChar) c;
494                     continue;
495                 }
496                 // syntaxError(chars, "'&' not after set");
497                 ec = U_MALFORMED_SET;
498                 return;
499             case u'^':
500                 // syntaxError(chars, "'^' not after '['");
501                 ec = U_MALFORMED_SET;
502                 return;
503             case u'{':
504                 if (op != 0) {
505                     // syntaxError(chars, "Missing operand after operator");
506                     ec = U_MALFORMED_SET;
507                     return;
508                 }
509                 if (lastItem == 1) {
510                     add(lastChar, lastChar);
511                     _appendToPat(patLocal, lastChar, FALSE);
512                 }
513                 lastItem = 0;
514                 buf.truncate(0);
515                 {
516                     UBool ok = FALSE;
517                     while (!chars.atEnd()) {
518                         c = chars.next(opts, literal, ec);
519                         if (U_FAILURE(ec)) return;
520                         if (c == u'}' && !literal) {
521                             ok = TRUE;
522                             break;
523                         }
524                         buf.append(c);
525                     }
526                     if (!ok) {
527                         // syntaxError(chars, "Invalid multicharacter string");
528                         ec = U_MALFORMED_SET;
529                         return;
530                     }
531                 }
532                 // We have new string. Add it to set and continue;
533                 // we don't need to drop through to the further
534                 // processing
535                 add(buf);
536                 patLocal.append(u'{');
537                 _appendToPat(patLocal, buf, FALSE);
538                 patLocal.append(u'}');
539                 continue;
540             case SymbolTable::SYMBOL_REF:
541                 //         symbols  nosymbols
542                 // [a-$]   error    error (ambiguous)
543                 // [a$]    anchor   anchor
544                 // [a-$x]  var "x"* literal '$'
545                 // [a-$.]  error    literal '$'
546                 // *We won't get here in the case of var "x"
547                 {
548                     chars.getPos(backup);
549                     c = chars.next(opts, literal, ec);
550                     if (U_FAILURE(ec)) return;
551                     UBool anchor = (c == u']' && !literal);
552                     if (symbols == 0 && !anchor) {
553                         c = SymbolTable::SYMBOL_REF;
554                         chars.setPos(backup);
555                         break; // literal '$'
556                     }
557                     if (anchor && op == 0) {
558                         if (lastItem == 1) {
559                             add(lastChar, lastChar);
560                             _appendToPat(patLocal, lastChar, FALSE);
561                         }
562                         add(U_ETHER);
563                         usePat = TRUE;
564                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
565                         patLocal.append(u']');
566                         mode = 2;
567                         continue;
568                     }
569                     // syntaxError(chars, "Unquoted '$'");
570                     ec = U_MALFORMED_SET;
571                     return;
572                 }
573             default:
574                 break;
575             }
576         }
577 
578         // -------- Parse literal characters.  This includes both
579         // escaped chars ("\u4E01") and non-syntax characters
580         // ("a").
581 
582         switch (lastItem) {
583         case 0:
584             lastItem = 1;
585             lastChar = c;
586             break;
587         case 1:
588             if (op == u'-') {
589                 if (lastChar >= c) {
590                     // Don't allow redundant (a-a) or empty (b-a) ranges;
591                     // these are most likely typos.
592                     // syntaxError(chars, "Invalid range");
593                     ec = U_MALFORMED_SET;
594                     return;
595                 }
596                 add(lastChar, c);
597                 _appendToPat(patLocal, lastChar, FALSE);
598                 patLocal.append(op);
599                 _appendToPat(patLocal, c, FALSE);
600                 lastItem = 0;
601                 op = 0;
602             } else {
603                 add(lastChar, lastChar);
604                 _appendToPat(patLocal, lastChar, FALSE);
605                 lastChar = c;
606             }
607             break;
608         case 2:
609             if (op != 0) {
610                 // syntaxError(chars, "Set expected after operator");
611                 ec = U_MALFORMED_SET;
612                 return;
613             }
614             lastChar = c;
615             lastItem = 1;
616             break;
617         }
618     }
619 
620     if (mode != 2) {
621         // syntaxError(chars, "Missing ']'");
622         ec = U_MALFORMED_SET;
623         return;
624     }
625 
626     chars.skipIgnored(opts);
627 
628     /**
629      * Handle global flags (invert, case insensitivity).  If this
630      * pattern should be compiled case-insensitive, then we need
631      * to close over case BEFORE COMPLEMENTING.  This makes
632      * patterns like /[^abc]/i work.
633      */
634     if ((options & USET_CASE_INSENSITIVE) != 0) {
635         (this->*caseClosure)(USET_CASE_INSENSITIVE);
636     }
637     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
638         (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
639     }
640     if (invert) {
641         complement();
642     }
643 
644     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
645     // generated pattern.
646     if (usePat) {
647         rebuiltPat.append(patLocal);
648     } else {
649         _generatePattern(rebuiltPat, FALSE);
650     }
651     if (isBogus() && U_SUCCESS(ec)) {
652         // We likely ran out of memory. AHHH!
653         ec = U_MEMORY_ALLOCATION_ERROR;
654     }
655 }
656 
657 //----------------------------------------------------------------
658 // Property set implementation
659 //----------------------------------------------------------------
660 
661 namespace {
662 
numericValueFilter(UChar32 ch,void * context)663 static UBool numericValueFilter(UChar32 ch, void* context) {
664     return u_getNumericValue(ch) == *(double*)context;
665 }
666 
generalCategoryMaskFilter(UChar32 ch,void * context)667 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
668     int32_t value = *(int32_t*)context;
669     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
670 }
671 
versionFilter(UChar32 ch,void * context)672 static UBool versionFilter(UChar32 ch, void* context) {
673     static const UVersionInfo none = { 0, 0, 0, 0 };
674     UVersionInfo v;
675     u_charAge(ch, v);
676     UVersionInfo* version = (UVersionInfo*)context;
677     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
678 }
679 
680 typedef struct {
681     UProperty prop;
682     int32_t value;
683 } IntPropertyContext;
684 
intPropertyFilter(UChar32 ch,void * context)685 static UBool intPropertyFilter(UChar32 ch, void* context) {
686     IntPropertyContext* c = (IntPropertyContext*)context;
687     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
688 }
689 
scriptExtensionsFilter(UChar32 ch,void * context)690 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
691     return uscript_hasScript(ch, *(UScriptCode*)context);
692 }
693 
694 }  // namespace
695 
696 /**
697  * Generic filter-based scanning code for UCD property UnicodeSets.
698  */
applyFilter(UnicodeSet::Filter filter,void * context,const UnicodeSet * inclusions,UErrorCode & status)699 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
700                              void* context,
701                              const UnicodeSet* inclusions,
702                              UErrorCode &status) {
703     if (U_FAILURE(status)) return;
704 
705     // Logically, walk through all Unicode characters, noting the start
706     // and end of each range for which filter.contain(c) is
707     // true.  Add each range to a set.
708     //
709     // To improve performance, use an inclusions set which
710     // encodes information about character ranges that are known
711     // to have identical properties.
712     // inclusions contains the first characters of
713     // same-value ranges for the given property.
714 
715     clear();
716 
717     UChar32 startHasProperty = -1;
718     int32_t limitRange = inclusions->getRangeCount();
719 
720     for (int j=0; j<limitRange; ++j) {
721         // get current range
722         UChar32 start = inclusions->getRangeStart(j);
723         UChar32 end = inclusions->getRangeEnd(j);
724 
725         // for all the code points in the range, process
726         for (UChar32 ch = start; ch <= end; ++ch) {
727             // only add to this UnicodeSet on inflection points --
728             // where the hasProperty value changes to false
729             if ((*filter)(ch, context)) {
730                 if (startHasProperty < 0) {
731                     startHasProperty = ch;
732                 }
733             } else if (startHasProperty >= 0) {
734                 add(startHasProperty, ch-1);
735                 startHasProperty = -1;
736             }
737         }
738     }
739     if (startHasProperty >= 0) {
740         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
741     }
742     if (isBogus() && U_SUCCESS(status)) {
743         // We likely ran out of memory. AHHH!
744         status = U_MEMORY_ALLOCATION_ERROR;
745     }
746 }
747 
748 namespace {
749 
mungeCharName(char * dst,const char * src,int32_t dstCapacity)750 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
751     /* Note: we use ' ' in compiler code page */
752     int32_t j = 0;
753     char ch;
754     --dstCapacity; /* make room for term. zero */
755     while ((ch = *src++) != 0) {
756         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
757             continue;
758         }
759         if (j >= dstCapacity) return FALSE;
760         dst[j++] = ch;
761     }
762     if (j > 0 && dst[j-1] == ' ') --j;
763     dst[j] = 0;
764     return TRUE;
765 }
766 
767 }  // namespace
768 
//----------------------------------------------------------------
// Property set API
//----------------------------------------------------------------

// Sets the given error code to U_ILLEGAL_ARGUMENT_ERROR and returns *this
// from the enclosing member function.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    (ec) = U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
777 
778 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)779 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
780     if (U_FAILURE(ec) || isFrozen()) { return *this; }
781     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
782         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783         applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
784     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
785         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
786         UScriptCode script = (UScriptCode)value;
787         applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
788     } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
789         if (value == 0 || value == 1) {
790             const USet *set = u_getBinaryPropertySet(prop, &ec);
791             if (U_FAILURE(ec)) { return *this; }
792             copyFrom(*UnicodeSet::fromUSet(set), TRUE);
793             if (value == 0) {
794                 complement();
795             }
796         } else {
797             clear();
798         }
799     } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
800         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
801         IntPropertyContext c = {prop, value};
802         applyFilter(intPropertyFilter, &c, inclusions, ec);
803     } else {
804         ec = U_ILLEGAL_ARGUMENT_ERROR;
805     }
806     return *this;
807 }
808 
809 UnicodeSet&
applyPropertyAlias(const UnicodeString & prop,const UnicodeString & value,UErrorCode & ec)810 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
811                                const UnicodeString& value,
812                                UErrorCode& ec) {
813     if (U_FAILURE(ec) || isFrozen()) return *this;
814 
815     // prop and value used to be converted to char * using the default
816     // converter instead of the invariant conversion.
817     // This should not be necessary because all Unicode property and value
818     // names use only invariant characters.
819     // If there are any variant characters, then we won't find them anyway.
820     // Checking first avoids assertion failures in the conversion.
821     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
822         !uprv_isInvariantUString(value.getBuffer(), value.length())
823     ) {
824         FAIL(ec);
825     }
826     CharString pname, vname;
827     pname.appendInvariantChars(prop, ec);
828     vname.appendInvariantChars(value, ec);
829     if (U_FAILURE(ec)) return *this;
830 
831     UProperty p;
832     int32_t v;
833     UBool invert = FALSE;
834 
835     if (value.length() > 0) {
836         p = u_getPropertyEnum(pname.data());
837         if (p == UCHAR_INVALID_CODE) FAIL(ec);
838 
839         // Treat gc as gcm
840         if (p == UCHAR_GENERAL_CATEGORY) {
841             p = UCHAR_GENERAL_CATEGORY_MASK;
842         }
843 
844         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
845             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
846             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
847             v = u_getPropertyValueEnum(p, vname.data());
848             if (v == UCHAR_INVALID_CODE) {
849                 // Handle numeric CCC
850                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
851                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
852                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
853                     char* end;
854                     double val = uprv_strtod(vname.data(), &end);
855                     // Anything between 0 and 255 is valid even if unused.
856                     // Cast double->int only after range check.
857                     // We catch NaN here because comparing it with both 0 and 255 will be false
858                     // (as are all comparisons with NaN).
859                     if (*end != 0 || !(0 <= val && val <= 255) ||
860                             (v = (int32_t)val) != val) {
861                         // non-integral value or outside 0..255, or trailing junk
862                         FAIL(ec);
863                     }
864                 } else {
865                     FAIL(ec);
866                 }
867             }
868         }
869 
870         else {
871 
872             switch (p) {
873             case UCHAR_NUMERIC_VALUE:
874                 {
875                     char* end;
876                     double val = uprv_strtod(vname.data(), &end);
877                     if (*end != 0) {
878                         FAIL(ec);
879                     }
880                     applyFilter(numericValueFilter, &val,
881                                 CharacterProperties::getInclusionsForProperty(p, ec), ec);
882                     return *this;
883                 }
884             case UCHAR_NAME:
885                 {
886                     // Must munge name, since u_charFromName() does not do
887                     // 'loose' matching.
888                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
889                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
890                     UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
891                     if (U_SUCCESS(ec)) {
892                         clear();
893                         add(ch);
894                         return *this;
895                     } else {
896                         FAIL(ec);
897                     }
898                 }
899             case UCHAR_UNICODE_1_NAME:
900                 // ICU 49 deprecates the Unicode_1_Name property APIs.
901                 FAIL(ec);
902             case UCHAR_AGE:
903                 {
904                     // Must munge name, since u_versionFromString() does not do
905                     // 'loose' matching.
906                     char buf[128];
907                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
908                     UVersionInfo version;
909                     u_versionFromString(version, buf);
910                     applyFilter(versionFilter, &version,
911                                 CharacterProperties::getInclusionsForProperty(p, ec), ec);
912                     return *this;
913                 }
914             case UCHAR_SCRIPT_EXTENSIONS:
915                 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
916                 if (v == UCHAR_INVALID_CODE) {
917                     FAIL(ec);
918                 }
919                 // fall through to calling applyIntPropertyValue()
920                 break;
921             default:
922                 // p is a non-binary, non-enumerated property that we
923                 // don't support (yet).
924                 FAIL(ec);
925             }
926         }
927     }
928 
929     else {
930         // value is empty.  Interpret as General Category, Script, or
931         // Binary property.
932         p = UCHAR_GENERAL_CATEGORY_MASK;
933         v = u_getPropertyValueEnum(p, pname.data());
934         if (v == UCHAR_INVALID_CODE) {
935             p = UCHAR_SCRIPT;
936             v = u_getPropertyValueEnum(p, pname.data());
937             if (v == UCHAR_INVALID_CODE) {
938                 p = u_getPropertyEnum(pname.data());
939                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
940                     v = 1;
941                 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
942                     set(MIN_VALUE, MAX_VALUE);
943                     return *this;
944                 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
945                     set(0, 0x7F);
946                     return *this;
947                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
948                     // [:Assigned:]=[:^Cn:]
949                     p = UCHAR_GENERAL_CATEGORY_MASK;
950                     v = U_GC_CN_MASK;
951                     invert = TRUE;
952                 } else {
953                     FAIL(ec);
954                 }
955             }
956         }
957     }
958 
959     applyIntPropertyValue(p, v, ec);
960     if(invert) {
961         complement();
962     }
963 
964     if (isBogus() && U_SUCCESS(ec)) {
965         // We likely ran out of memory. AHHH!
966         ec = U_MEMORY_ALLOCATION_ERROR;
967     }
968     return *this;
969 }
970 
971 //----------------------------------------------------------------
972 // Property set patterns
973 //----------------------------------------------------------------
974 
975 /**
976  * Return true if the given position, in the given pattern, appears
977  * to be the start of a property set pattern.
978  */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)979 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
980                                            int32_t pos) {
981     // Patterns are at least 5 characters long
982     if ((pos+5) > pattern.length()) {
983         return FALSE;
984     }
985 
986     // Look for an opening [:, [:^, \p, or \P
987     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
988 }
989 
990 /**
991  * Return true if the given iterator appears to point at a
992  * property pattern.  Regardless of the result, return with the
993  * iterator unchanged.
994  * @param chars iterator over the pattern characters.  Upon return
995  * it will be unchanged.
996  * @param iterOpts RuleCharacterIterator options
997  */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)998 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
999                                            int32_t iterOpts) {
1000     // NOTE: literal will always be FALSE, because we don't parse escapes.
1001     UBool result = FALSE, literal;
1002     UErrorCode ec = U_ZERO_ERROR;
1003     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1004     RuleCharacterIterator::Pos pos;
1005     chars.getPos(pos);
1006     UChar32 c = chars.next(iterOpts, literal, ec);
1007     if (c == u'[' || c == u'\\') {
1008         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1009                                literal, ec);
1010         result = (c == u'[') ? (d == u':') :
1011                                (d == u'N' || d == u'p' || d == u'P');
1012     }
1013     chars.setPos(pos);
1014     return result && U_SUCCESS(ec);
1015 }
1016 
1017 /**
1018  * Parse the given property pattern at the given parse position.
1019  */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1020 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1021                                              ParsePosition& ppos,
1022                                              UErrorCode &ec) {
1023     int32_t pos = ppos.getIndex();
1024 
1025     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1026     UBool isName = FALSE; // true for \N{pat}, o/w false
1027     UBool invert = FALSE;
1028 
1029     if (U_FAILURE(ec)) return *this;
1030 
1031     // Minimum length is 5 characters, e.g. \p{L}
1032     if ((pos+5) > pattern.length()) {
1033         FAIL(ec);
1034     }
1035 
1036     // On entry, ppos should point to one of the following locations:
1037     // Look for an opening [:, [:^, \p, or \P
1038     if (isPOSIXOpen(pattern, pos)) {
1039         posix = TRUE;
1040         pos += 2;
1041         pos = ICU_Utility::skipWhitespace(pattern, pos);
1042         if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
1043             ++pos;
1044             invert = TRUE;
1045         }
1046     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1047         UChar c = pattern.charAt(pos+1);
1048         invert = (c == u'P');
1049         isName = (c == u'N');
1050         pos += 2;
1051         pos = ICU_Utility::skipWhitespace(pattern, pos);
1052         if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
1053             // Syntax error; "\p" or "\P" not followed by "{"
1054             FAIL(ec);
1055         }
1056     } else {
1057         // Open delimiter not seen
1058         FAIL(ec);
1059     }
1060 
1061     // Look for the matching close delimiter, either :] or }
1062     int32_t close;
1063     if (posix) {
1064       close = pattern.indexOf(u":]", 2, pos);
1065     } else {
1066       close = pattern.indexOf(u'}', pos);
1067     }
1068     if (close < 0) {
1069         // Syntax error; close delimiter missing
1070         FAIL(ec);
1071     }
1072 
1073     // Look for an '=' sign.  If this is present, we will parse a
1074     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1075     // pattern.
1076     int32_t equals = pattern.indexOf(u'=', pos);
1077     UnicodeString propName, valueName;
1078     if (equals >= 0 && equals < close && !isName) {
1079         // Equals seen; parse medium/long pattern
1080         pattern.extractBetween(pos, equals, propName);
1081         pattern.extractBetween(equals+1, close, valueName);
1082     }
1083 
1084     else {
1085         // Handle case where no '=' is seen, and \N{}
1086         pattern.extractBetween(pos, close, propName);
1087 
1088         // Handle \N{name}
1089         if (isName) {
1090             // This is a little inefficient since it means we have to
1091             // parse NAME_PROP back to UCHAR_NAME even though we already
1092             // know it's UCHAR_NAME.  If we refactor the API to
1093             // support args of (UProperty, char*) then we can remove
1094             // NAME_PROP and make this a little more efficient.
1095             valueName = propName;
1096             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1097         }
1098     }
1099 
1100     applyPropertyAlias(propName, valueName, ec);
1101 
1102     if (U_SUCCESS(ec)) {
1103         if (invert) {
1104             complement();
1105         }
1106 
1107         // Move to the limit position after the close delimiter if the
1108         // parse succeeded.
1109         ppos.setIndex(close + (posix ? 2 : 1));
1110     }
1111 
1112     return *this;
1113 }
1114 
1115 /**
1116  * Parse a property pattern.
1117  * @param chars iterator over the pattern characters.  Upon return
1118  * it will be advanced to the first character after the parsed
1119  * pattern, or the end of the iteration if all characters are
1120  * parsed.
1121  * @param rebuiltPat the pattern that was parsed, rebuilt or
1122  * copied from the input pattern, as appropriate.
1123  */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1124 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1125                                       UnicodeString& rebuiltPat,
1126                                       UErrorCode& ec) {
1127     if (U_FAILURE(ec)) return;
1128     UnicodeString pattern;
1129     chars.lookahead(pattern);
1130     ParsePosition pos(0);
1131     applyPropertyPattern(pattern, pos, ec);
1132     if (U_FAILURE(ec)) return;
1133     if (pos.getIndex() == 0) {
1134         // syntaxError(chars, "Invalid property pattern");
1135         ec = U_MALFORMED_SET;
1136         return;
1137     }
1138     chars.jumpahead(pos.getIndex());
1139     rebuiltPat.append(pattern, 0, pos.getIndex());
1140 }
1141 
1142 U_NAMESPACE_END
1143