1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // characterproperties.cpp
5 // created: 2018sep03 Markus W. Scherer
6
7 #include "unicode/utypes.h"
8 #include "unicode/localpointer.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/ucptrie.h"
12 #include "unicode/umutablecptrie.h"
13 #include "unicode/uniset.h"
14 #include "unicode/uscript.h"
15 #include "unicode/uset.h"
16 #include "cmemory.h"
17 #include "mutex.h"
18 #include "normalizer2impl.h"
19 #include "uassert.h"
20 #include "ubidi_props.h"
21 #include "ucase.h"
22 #include "ucln_cmn.h"
23 #include "umutex.h"
24 #include "uprops.h"
25
26 using icu::LocalPointer;
27 #if !UCONFIG_NO_NORMALIZATION
28 using icu::Normalizer2Factory;
29 using icu::Normalizer2Impl;
30 #endif
31 using icu::UInitOnce;
32 using icu::UnicodeSet;
33
34 namespace {
35
36 UBool U_CALLCONV characterproperties_cleanup();
37
38 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
39
40 struct Inclusion {
41 UnicodeSet *fSet = nullptr;
42 UInitOnce fInitOnce = U_INITONCE_INITIALIZER;
43 };
44 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
45
46 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
47
48 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
49
50 icu::UMutex cpMutex;
51
52 //----------------------------------------------------------------
53 // Inclusions list
54 //----------------------------------------------------------------
55
56 // USetAdder implementation
57 // Does not use uset.h to reduce code dependencies
58 void U_CALLCONV
_set_add(USet * set,UChar32 c)59 _set_add(USet *set, UChar32 c) {
60 ((UnicodeSet *)set)->add(c);
61 }
62
63 void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)64 _set_addRange(USet *set, UChar32 start, UChar32 end) {
65 ((UnicodeSet *)set)->add(start, end);
66 }
67
68 void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)69 _set_addString(USet *set, const UChar *str, int32_t length) {
70 ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
71 }
72
characterproperties_cleanup()73 UBool U_CALLCONV characterproperties_cleanup() {
74 for (Inclusion &in: gInclusions) {
75 delete in.fSet;
76 in.fSet = nullptr;
77 in.fInitOnce.reset();
78 }
79 for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
80 delete sets[i];
81 sets[i] = nullptr;
82 }
83 for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
84 ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
85 maps[i] = nullptr;
86 }
87 return TRUE;
88 }
89
initInclusion(UPropertySource src,UErrorCode & errorCode)90 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
91 // This function is invoked only via umtx_initOnce().
92 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
93 if (src == UPROPS_SRC_NONE) {
94 errorCode = U_INTERNAL_PROGRAM_ERROR;
95 return;
96 }
97 U_ASSERT(gInclusions[src].fSet == nullptr);
98
99 LocalPointer<UnicodeSet> incl(new UnicodeSet());
100 if (incl.isNull()) {
101 errorCode = U_MEMORY_ALLOCATION_ERROR;
102 return;
103 }
104 USetAdder sa = {
105 (USet *)incl.getAlias(),
106 _set_add,
107 _set_addRange,
108 _set_addString,
109 nullptr, // don't need remove()
110 nullptr // don't need removeRange()
111 };
112
113 switch(src) {
114 case UPROPS_SRC_CHAR:
115 uchar_addPropertyStarts(&sa, &errorCode);
116 break;
117 case UPROPS_SRC_PROPSVEC:
118 upropsvec_addPropertyStarts(&sa, &errorCode);
119 break;
120 case UPROPS_SRC_CHAR_AND_PROPSVEC:
121 uchar_addPropertyStarts(&sa, &errorCode);
122 upropsvec_addPropertyStarts(&sa, &errorCode);
123 break;
124 #if !UCONFIG_NO_NORMALIZATION
125 case UPROPS_SRC_CASE_AND_NORM: {
126 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
127 if(U_SUCCESS(errorCode)) {
128 impl->addPropertyStarts(&sa, errorCode);
129 }
130 ucase_addPropertyStarts(&sa, &errorCode);
131 break;
132 }
133 case UPROPS_SRC_NFC: {
134 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
135 if(U_SUCCESS(errorCode)) {
136 impl->addPropertyStarts(&sa, errorCode);
137 }
138 break;
139 }
140 case UPROPS_SRC_NFKC: {
141 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
142 if(U_SUCCESS(errorCode)) {
143 impl->addPropertyStarts(&sa, errorCode);
144 }
145 break;
146 }
147 case UPROPS_SRC_NFKC_CF: {
148 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
149 if(U_SUCCESS(errorCode)) {
150 impl->addPropertyStarts(&sa, errorCode);
151 }
152 break;
153 }
154 case UPROPS_SRC_NFC_CANON_ITER: {
155 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
156 if(U_SUCCESS(errorCode)) {
157 impl->addCanonIterPropertyStarts(&sa, errorCode);
158 }
159 break;
160 }
161 #endif
162 case UPROPS_SRC_CASE:
163 ucase_addPropertyStarts(&sa, &errorCode);
164 break;
165 case UPROPS_SRC_BIDI:
166 ubidi_addPropertyStarts(&sa, &errorCode);
167 break;
168 case UPROPS_SRC_INPC:
169 case UPROPS_SRC_INSC:
170 case UPROPS_SRC_VO:
171 uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
172 break;
173 default:
174 errorCode = U_INTERNAL_PROGRAM_ERROR;
175 break;
176 }
177
178 if (U_FAILURE(errorCode)) {
179 return;
180 }
181 if (incl->isBogus()) {
182 errorCode = U_MEMORY_ALLOCATION_ERROR;
183 return;
184 }
185 // Compact for caching.
186 incl->compact();
187 gInclusions[src].fSet = incl.orphan();
188 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
189 }
190
getInclusionsForSource(UPropertySource src,UErrorCode & errorCode)191 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
192 if (U_FAILURE(errorCode)) { return nullptr; }
193 if (src < 0 || UPROPS_SRC_COUNT <= src) {
194 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
195 return nullptr;
196 }
197 Inclusion &i = gInclusions[src];
198 umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
199 return i.fSet;
200 }
201
initIntPropInclusion(UProperty prop,UErrorCode & errorCode)202 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
203 // This function is invoked only via umtx_initOnce().
204 U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
205 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
206 U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
207 UPropertySource src = uprops_getSource(prop);
208 const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
209 if (U_FAILURE(errorCode)) {
210 return;
211 }
212
213 LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
214 if (intPropIncl.isNull()) {
215 errorCode = U_MEMORY_ALLOCATION_ERROR;
216 return;
217 }
218 int32_t numRanges = incl->getRangeCount();
219 int32_t prevValue = 0;
220 for (int32_t i = 0; i < numRanges; ++i) {
221 UChar32 rangeEnd = incl->getRangeEnd(i);
222 for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
223 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
224 int32_t value = u_getIntPropertyValue(c, prop);
225 if (value != prevValue) {
226 intPropIncl->add(c);
227 prevValue = value;
228 }
229 }
230 }
231
232 if (intPropIncl->isBogus()) {
233 errorCode = U_MEMORY_ALLOCATION_ERROR;
234 return;
235 }
236 // Compact for caching.
237 intPropIncl->compact();
238 gInclusions[inclIndex].fSet = intPropIncl.orphan();
239 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
240 }
241
242 } // namespace
243
244 U_NAMESPACE_BEGIN
245
getInclusionsForProperty(UProperty prop,UErrorCode & errorCode)246 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
247 UProperty prop, UErrorCode &errorCode) {
248 if (U_FAILURE(errorCode)) { return nullptr; }
249 if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
250 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
251 Inclusion &i = gInclusions[inclIndex];
252 umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
253 return i.fSet;
254 } else {
255 UPropertySource src = uprops_getSource(prop);
256 return getInclusionsForSource(src, errorCode);
257 }
258 }
259
260 U_NAMESPACE_END
261
262 namespace {
263
makeSet(UProperty property,UErrorCode & errorCode)264 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
265 if (U_FAILURE(errorCode)) { return nullptr; }
266 LocalPointer<UnicodeSet> set(new UnicodeSet());
267 if (set.isNull()) {
268 errorCode = U_MEMORY_ALLOCATION_ERROR;
269 return nullptr;
270 }
271 const UnicodeSet *inclusions =
272 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
273 if (U_FAILURE(errorCode)) { return nullptr; }
274 int32_t numRanges = inclusions->getRangeCount();
275 UChar32 startHasProperty = -1;
276
277 for (int32_t i = 0; i < numRanges; ++i) {
278 UChar32 rangeEnd = inclusions->getRangeEnd(i);
279 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
280 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
281 if (u_hasBinaryProperty(c, property)) {
282 if (startHasProperty < 0) {
283 // Transition from false to true.
284 startHasProperty = c;
285 }
286 } else if (startHasProperty >= 0) {
287 // Transition from true to false.
288 set->add(startHasProperty, c - 1);
289 startHasProperty = -1;
290 }
291 }
292 }
293 if (startHasProperty >= 0) {
294 set->add(startHasProperty, 0x10FFFF);
295 }
296 set->freeze();
297 return set.orphan();
298 }
299
makeMap(UProperty property,UErrorCode & errorCode)300 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
301 if (U_FAILURE(errorCode)) { return nullptr; }
302 uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
303 icu::LocalUMutableCPTriePointer mutableTrie(
304 umutablecptrie_open(nullValue, nullValue, &errorCode));
305 const UnicodeSet *inclusions =
306 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
307 if (U_FAILURE(errorCode)) { return nullptr; }
308 int32_t numRanges = inclusions->getRangeCount();
309 UChar32 start = 0;
310 uint32_t value = nullValue;
311
312 for (int32_t i = 0; i < numRanges; ++i) {
313 UChar32 rangeEnd = inclusions->getRangeEnd(i);
314 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
315 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
316 uint32_t nextValue = u_getIntPropertyValue(c, property);
317 if (value != nextValue) {
318 if (value != nullValue) {
319 umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
320 }
321 start = c;
322 value = nextValue;
323 }
324 }
325 }
326 if (value != 0) {
327 umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
328 }
329
330 UCPTrieType type;
331 if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
332 type = UCPTRIE_TYPE_FAST;
333 } else {
334 type = UCPTRIE_TYPE_SMALL;
335 }
336 UCPTrieValueWidth valueWidth;
337 // TODO: UCharacterProperty.IntProperty
338 int32_t max = u_getIntPropertyMaxValue(property);
339 if (max <= 0xff) {
340 valueWidth = UCPTRIE_VALUE_BITS_8;
341 } else if (max <= 0xffff) {
342 valueWidth = UCPTRIE_VALUE_BITS_16;
343 } else {
344 valueWidth = UCPTRIE_VALUE_BITS_32;
345 }
346 return reinterpret_cast<UCPMap *>(
347 umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
348 }
349
350 } // namespace
351
352 U_NAMESPACE_USE
353
354 U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property,UErrorCode * pErrorCode)355 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
356 if (U_FAILURE(*pErrorCode)) { return nullptr; }
357 if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
358 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
359 return nullptr;
360 }
361 Mutex m(&cpMutex);
362 UnicodeSet *set = sets[property];
363 if (set == nullptr) {
364 sets[property] = set = makeSet(property, *pErrorCode);
365 }
366 if (U_FAILURE(*pErrorCode)) { return nullptr; }
367 return set->toUSet();
368 }
369
370 U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property,UErrorCode * pErrorCode)371 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
372 if (U_FAILURE(*pErrorCode)) { return nullptr; }
373 if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
374 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
375 return nullptr;
376 }
377 Mutex m(&cpMutex);
378 UCPMap *map = maps[property - UCHAR_INT_START];
379 if (map == nullptr) {
380 maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
381 }
382 return map;
383 }
384