1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6 
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "mutex.h"
18 #include "normalizer2impl.h"
19 #include "uassert.h"
20 #include "ubidi_props.h"
21 #include "ucase.h"
22 #include "ucln_cmn.h"
23 #include "umutex.h"
24 #include "uprops.h"
25 
26 using icu::LocalPointer;
27 #if !UCONFIG_NO_NORMALIZATION
28 using icu::Normalizer2Factory;
29 using icu::Normalizer2Impl;
30 #endif
31 using icu::UInitOnce;
32 using icu::UnicodeSet;
33 
34 namespace {
35 
36 UBool U_CALLCONV characterproperties_cleanup();
37 
38 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
39 
40 struct Inclusion {
41     UnicodeSet  *fSet = nullptr;
42     UInitOnce    fInitOnce = U_INITONCE_INITIALIZER;
43 };
44 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
45 
46 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
47 
48 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
49 
50 icu::UMutex cpMutex;
51 
52 //----------------------------------------------------------------
53 // Inclusions list
54 //----------------------------------------------------------------
55 
56 // USetAdder implementation
57 // Does not use uset.h to reduce code dependencies
58 void U_CALLCONV
_set_add(USet * set,UChar32 c)59 _set_add(USet *set, UChar32 c) {
60     ((UnicodeSet *)set)->add(c);
61 }
62 
63 void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)64 _set_addRange(USet *set, UChar32 start, UChar32 end) {
65     ((UnicodeSet *)set)->add(start, end);
66 }
67 
68 void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)69 _set_addString(USet *set, const UChar *str, int32_t length) {
70     ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
71 }
72 
characterproperties_cleanup()73 UBool U_CALLCONV characterproperties_cleanup() {
74     for (Inclusion &in: gInclusions) {
75         delete in.fSet;
76         in.fSet = nullptr;
77         in.fInitOnce.reset();
78     }
79     for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
80         delete sets[i];
81         sets[i] = nullptr;
82     }
83     for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
84         ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
85         maps[i] = nullptr;
86     }
87     return TRUE;
88 }
89 
initInclusion(UPropertySource src,UErrorCode & errorCode)90 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
91     // This function is invoked only via umtx_initOnce().
92     U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
93     if (src == UPROPS_SRC_NONE) {
94         errorCode = U_INTERNAL_PROGRAM_ERROR;
95         return;
96     }
97     U_ASSERT(gInclusions[src].fSet == nullptr);
98 
99     LocalPointer<UnicodeSet> incl(new UnicodeSet());
100     if (incl.isNull()) {
101         errorCode = U_MEMORY_ALLOCATION_ERROR;
102         return;
103     }
104     USetAdder sa = {
105         (USet *)incl.getAlias(),
106         _set_add,
107         _set_addRange,
108         _set_addString,
109         nullptr, // don't need remove()
110         nullptr // don't need removeRange()
111     };
112 
113     switch(src) {
114     case UPROPS_SRC_CHAR:
115         uchar_addPropertyStarts(&sa, &errorCode);
116         break;
117     case UPROPS_SRC_PROPSVEC:
118         upropsvec_addPropertyStarts(&sa, &errorCode);
119         break;
120     case UPROPS_SRC_CHAR_AND_PROPSVEC:
121         uchar_addPropertyStarts(&sa, &errorCode);
122         upropsvec_addPropertyStarts(&sa, &errorCode);
123         break;
124 #if !UCONFIG_NO_NORMALIZATION
125     case UPROPS_SRC_CASE_AND_NORM: {
126         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
127         if(U_SUCCESS(errorCode)) {
128             impl->addPropertyStarts(&sa, errorCode);
129         }
130         ucase_addPropertyStarts(&sa, &errorCode);
131         break;
132     }
133     case UPROPS_SRC_NFC: {
134         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
135         if(U_SUCCESS(errorCode)) {
136             impl->addPropertyStarts(&sa, errorCode);
137         }
138         break;
139     }
140     case UPROPS_SRC_NFKC: {
141         const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
142         if(U_SUCCESS(errorCode)) {
143             impl->addPropertyStarts(&sa, errorCode);
144         }
145         break;
146     }
147     case UPROPS_SRC_NFKC_CF: {
148         const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
149         if(U_SUCCESS(errorCode)) {
150             impl->addPropertyStarts(&sa, errorCode);
151         }
152         break;
153     }
154     case UPROPS_SRC_NFC_CANON_ITER: {
155         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
156         if(U_SUCCESS(errorCode)) {
157             impl->addCanonIterPropertyStarts(&sa, errorCode);
158         }
159         break;
160     }
161 #endif
162     case UPROPS_SRC_CASE:
163         ucase_addPropertyStarts(&sa, &errorCode);
164         break;
165     case UPROPS_SRC_BIDI:
166         ubidi_addPropertyStarts(&sa, &errorCode);
167         break;
168     case UPROPS_SRC_INPC:
169     case UPROPS_SRC_INSC:
170     case UPROPS_SRC_VO:
171         uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
172         break;
173     default:
174         errorCode = U_INTERNAL_PROGRAM_ERROR;
175         break;
176     }
177 
178     if (U_FAILURE(errorCode)) {
179         return;
180     }
181     if (incl->isBogus()) {
182         errorCode = U_MEMORY_ALLOCATION_ERROR;
183         return;
184     }
185     // Compact for caching.
186     incl->compact();
187     gInclusions[src].fSet = incl.orphan();
188     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
189 }
190 
getInclusionsForSource(UPropertySource src,UErrorCode & errorCode)191 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
192     if (U_FAILURE(errorCode)) { return nullptr; }
193     if (src < 0 || UPROPS_SRC_COUNT <= src) {
194         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
195         return nullptr;
196     }
197     Inclusion &i = gInclusions[src];
198     umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
199     return i.fSet;
200 }
201 
initIntPropInclusion(UProperty prop,UErrorCode & errorCode)202 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
203     // This function is invoked only via umtx_initOnce().
204     U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
205     int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
206     U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
207     UPropertySource src = uprops_getSource(prop);
208     const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
209     if (U_FAILURE(errorCode)) {
210         return;
211     }
212 
213     LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
214     if (intPropIncl.isNull()) {
215         errorCode = U_MEMORY_ALLOCATION_ERROR;
216         return;
217     }
218     int32_t numRanges = incl->getRangeCount();
219     int32_t prevValue = 0;
220     for (int32_t i = 0; i < numRanges; ++i) {
221         UChar32 rangeEnd = incl->getRangeEnd(i);
222         for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
223             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
224             int32_t value = u_getIntPropertyValue(c, prop);
225             if (value != prevValue) {
226                 intPropIncl->add(c);
227                 prevValue = value;
228             }
229         }
230     }
231 
232     if (intPropIncl->isBogus()) {
233         errorCode = U_MEMORY_ALLOCATION_ERROR;
234         return;
235     }
236     // Compact for caching.
237     intPropIncl->compact();
238     gInclusions[inclIndex].fSet = intPropIncl.orphan();
239     ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
240 }
241 
242 }  // namespace
243 
244 U_NAMESPACE_BEGIN
245 
getInclusionsForProperty(UProperty prop,UErrorCode & errorCode)246 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
247         UProperty prop, UErrorCode &errorCode) {
248     if (U_FAILURE(errorCode)) { return nullptr; }
249     if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
250         int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
251         Inclusion &i = gInclusions[inclIndex];
252         umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
253         return i.fSet;
254     } else {
255         UPropertySource src = uprops_getSource(prop);
256         return getInclusionsForSource(src, errorCode);
257     }
258 }
259 
260 U_NAMESPACE_END
261 
262 namespace {
263 
makeSet(UProperty property,UErrorCode & errorCode)264 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
265     if (U_FAILURE(errorCode)) { return nullptr; }
266     LocalPointer<UnicodeSet> set(new UnicodeSet());
267     if (set.isNull()) {
268         errorCode = U_MEMORY_ALLOCATION_ERROR;
269         return nullptr;
270     }
271     const UnicodeSet *inclusions =
272         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
273     if (U_FAILURE(errorCode)) { return nullptr; }
274     int32_t numRanges = inclusions->getRangeCount();
275     UChar32 startHasProperty = -1;
276 
277     for (int32_t i = 0; i < numRanges; ++i) {
278         UChar32 rangeEnd = inclusions->getRangeEnd(i);
279         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
280             // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
281             if (u_hasBinaryProperty(c, property)) {
282                 if (startHasProperty < 0) {
283                     // Transition from false to true.
284                     startHasProperty = c;
285                 }
286             } else if (startHasProperty >= 0) {
287                 // Transition from true to false.
288                 set->add(startHasProperty, c - 1);
289                 startHasProperty = -1;
290             }
291         }
292     }
293     if (startHasProperty >= 0) {
294         set->add(startHasProperty, 0x10FFFF);
295     }
296     set->freeze();
297     return set.orphan();
298 }
299 
makeMap(UProperty property,UErrorCode & errorCode)300 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
301     if (U_FAILURE(errorCode)) { return nullptr; }
302     uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
303     icu::LocalUMutableCPTriePointer mutableTrie(
304         umutablecptrie_open(nullValue, nullValue, &errorCode));
305     const UnicodeSet *inclusions =
306         icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
307     if (U_FAILURE(errorCode)) { return nullptr; }
308     int32_t numRanges = inclusions->getRangeCount();
309     UChar32 start = 0;
310     uint32_t value = nullValue;
311 
312     for (int32_t i = 0; i < numRanges; ++i) {
313         UChar32 rangeEnd = inclusions->getRangeEnd(i);
314         for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
315             // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
316             uint32_t nextValue = u_getIntPropertyValue(c, property);
317             if (value != nextValue) {
318                 if (value != nullValue) {
319                     umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
320                 }
321                 start = c;
322                 value = nextValue;
323             }
324         }
325     }
326     if (value != 0) {
327         umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
328     }
329 
330     UCPTrieType type;
331     if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
332         type = UCPTRIE_TYPE_FAST;
333     } else {
334         type = UCPTRIE_TYPE_SMALL;
335     }
336     UCPTrieValueWidth valueWidth;
337     // TODO: UCharacterProperty.IntProperty
338     int32_t max = u_getIntPropertyMaxValue(property);
339     if (max <= 0xff) {
340         valueWidth = UCPTRIE_VALUE_BITS_8;
341     } else if (max <= 0xffff) {
342         valueWidth = UCPTRIE_VALUE_BITS_16;
343     } else {
344         valueWidth = UCPTRIE_VALUE_BITS_32;
345     }
346     return reinterpret_cast<UCPMap *>(
347         umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
348 }
349 
350 }  // namespace
351 
352 U_NAMESPACE_USE
353 
354 U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property,UErrorCode * pErrorCode)355 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
356     if (U_FAILURE(*pErrorCode)) { return nullptr; }
357     if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
358         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
359         return nullptr;
360     }
361     Mutex m(&cpMutex);
362     UnicodeSet *set = sets[property];
363     if (set == nullptr) {
364         sets[property] = set = makeSet(property, *pErrorCode);
365     }
366     if (U_FAILURE(*pErrorCode)) { return nullptr; }
367     return set->toUSet();
368 }
369 
370 U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property,UErrorCode * pErrorCode)371 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
372     if (U_FAILURE(*pErrorCode)) { return nullptr; }
373     if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
374         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
375         return nullptr;
376     }
377     Mutex m(&cpMutex);
378     UCPMap *map = maps[property - UCHAR_INT_START];
379     if (map == nullptr) {
380         maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
381     }
382     return map;
383 }
384