1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include "static_unicode_sets.h"
13 #include "umutex.h"
14 #include "ucln_cmn.h"
15 #include "unicode/uniset.h"
16 #include "uresimp.h"
17 #include "cstring.h"
18 #include "uassert.h"
19 
20 using namespace icu;
21 using namespace icu::unisets;
22 
23 
24 namespace {
25 
// Table of the shared sets, indexed by Key. Every slot starts out null; slots
// are filled with heap-allocated sets during initialization and are deleted
// and reset to null again by the cleanup function in this file.
UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};

// Save the empty instance in static memory to have well-defined behavior if a
// regular UnicodeSet cannot be allocated.
// This is only raw, suitably aligned storage: the UnicodeSet object itself is
// constructed into it with placement new during initialization.
alignas(UnicodeSet)
char gEmptyUnicodeSet[sizeof(UnicodeSet)];

// Whether the gEmptyUnicodeSet is initialized and ready to use.
// Checked at cleanup so the destructor only runs on a constructed object.
UBool gEmptyUnicodeSetInitialized = FALSE;
35 
getImpl(Key key)36 inline UnicodeSet* getImpl(Key key) {
37     UnicodeSet* candidate = gUnicodeSets[key];
38     if (candidate == nullptr) {
39         return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
40     }
41     return candidate;
42 }
43 
computeUnion(Key k1,Key k2)44 UnicodeSet* computeUnion(Key k1, Key k2) {
45     UnicodeSet* result = new UnicodeSet();
46     if (result == nullptr) {
47         return nullptr;
48     }
49     result->addAll(*getImpl(k1));
50     result->addAll(*getImpl(k2));
51     result->freeze();
52     return result;
53 }
54 
computeUnion(Key k1,Key k2,Key k3)55 UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
56     UnicodeSet* result = new UnicodeSet();
57     if (result == nullptr) {
58         return nullptr;
59     }
60     result->addAll(*getImpl(k1));
61     result->addAll(*getImpl(k2));
62     result->addAll(*getImpl(k3));
63     result->freeze();
64     return result;
65 }
66 
67 
saveSet(Key key,const UnicodeString & unicodeSetPattern,UErrorCode & status)68 void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
69     // assert unicodeSets.get(key) == null;
70     gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
71 }
72 
73 class ParseDataSink : public ResourceSink {
74   public:
put(const char * key,ResourceValue & value,UBool,UErrorCode & status)75     void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
76         ResourceTable contextsTable = value.getTable(status);
77         if (U_FAILURE(status)) { return; }
78         for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
79             if (uprv_strcmp(key, "date") == 0) {
80                 // ignore
81             } else {
82                 ResourceTable strictnessTable = value.getTable(status);
83                 if (U_FAILURE(status)) { return; }
84                 for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
85                     bool isLenient = (uprv_strcmp(key, "lenient") == 0);
86                     ResourceArray array = value.getArray(status);
87                     if (U_FAILURE(status)) { return; }
88                     for (int k = 0; k < array.getSize(); k++) {
89                         array.getValue(k, value);
90                         UnicodeString str = value.getUnicodeString(status);
91                         if (U_FAILURE(status)) { return; }
92                         // There is both lenient and strict data for comma/period,
93                         // but not for any of the other symbols.
94                         if (str.indexOf(u'.') != -1) {
95                             saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
96                         } else if (str.indexOf(u',') != -1) {
97                             saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
98                         } else if (str.indexOf(u'+') != -1) {
99                             saveSet(PLUS_SIGN, str, status);
100                         } else if (str.indexOf(u'-') != -1) {
101                             saveSet(MINUS_SIGN, str, status);
102                         } else if (str.indexOf(u'$') != -1) {
103                             saveSet(DOLLAR_SIGN, str, status);
104                         } else if (str.indexOf(u'£') != -1) {
105                             saveSet(POUND_SIGN, str, status);
106                         } else if (str.indexOf(u'₹') != -1) {
107                             saveSet(RUPEE_SIGN, str, status);
108                         } else if (str.indexOf(u'¥') != -1) {
109                             saveSet(YEN_SIGN, str, status);
110                         } else if (str.indexOf(u'₩') != -1) {
111                             saveSet(WON_SIGN, str, status);
112                         } else if (str.indexOf(u'%') != -1) {
113                             saveSet(PERCENT_SIGN, str, status);
114                         } else if (str.indexOf(u'‰') != -1) {
115                             saveSet(PERMILLE_SIGN, str, status);
116                         } else if (str.indexOf(u'’') != -1) {
117                             saveSet(APOSTROPHE_SIGN, str, status);
118                         } else {
119                             // Unknown class of parse lenients
120                             // TODO(ICU-20428): Make ICU automatically accept new classes?
121                             U_ASSERT(FALSE);
122                         }
123                         if (U_FAILURE(status)) { return; }
124                     }
125                 }
126             }
127         }
128     }
129 };
130 
131 
// Guard passed to umtx_initOnce() so the sets in this file are built once;
// the cleanup function resets it so that initialization can run again afterwards.
icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
133 
cleanupNumberParseUniSets()134 UBool U_CALLCONV cleanupNumberParseUniSets() {
135     if (gEmptyUnicodeSetInitialized) {
136         reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
137         gEmptyUnicodeSetInitialized = FALSE;
138     }
139     for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
140         delete gUnicodeSets[i];
141         gUnicodeSets[i] = nullptr;
142     }
143     gNumberParseUniSetsInitOnce.reset();
144     return TRUE;
145 }
146 
initNumberParseUniSets(UErrorCode & status)147 void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
148     ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
149 
150     // Initialize the empty instance for well-defined fallback behavior
151     new(gEmptyUnicodeSet) UnicodeSet();
152     reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
153     gEmptyUnicodeSetInitialized = TRUE;
154 
155     // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
156     // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
157     gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
158             u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
159     gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
160 
161     LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
162     if (U_FAILURE(status)) { return; }
163     ParseDataSink sink;
164     ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
165     if (U_FAILURE(status)) { return; }
166 
167     // NOTE: It is OK for these assertions to fail if there was a no-data build.
168     U_ASSERT(gUnicodeSets[COMMA] != nullptr);
169     U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
170     U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
171     U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
172     U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
173 
174     LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
175         u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
176         status
177     ), status);
178     if (U_FAILURE(status)) { return; }
179     otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
180     gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
181     gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
182     gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
183             STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
184 
185     U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
186     U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
187     U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
188     U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
189 
190     gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
191     if (U_FAILURE(status)) { return; }
192 
193     U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
194     U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
195     U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
196     U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
197     U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
198 
199     gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
200     if (U_FAILURE(status)) { return; }
201     gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
202     gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
203 
204     for (auto* uniset : gUnicodeSets) {
205         if (uniset != nullptr) {
206             uniset->freeze();
207         }
208     }
209 }
210 
211 }
212 
get(Key key)213 const UnicodeSet* unisets::get(Key key) {
214     UErrorCode localStatus = U_ZERO_ERROR;
215     umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
216     if (U_FAILURE(localStatus)) {
217         return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
218     }
219     return getImpl(key);
220 }
221 
chooseFrom(UnicodeString str,Key key1)222 Key unisets::chooseFrom(UnicodeString str, Key key1) {
223     return get(key1)->contains(str) ? key1 : NONE;
224 }
225 
chooseFrom(UnicodeString str,Key key1,Key key2)226 Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
227     return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
228 }
229 
230 //Key unisets::chooseCurrency(UnicodeString str) {
231 //    if (get(DOLLAR_SIGN)->contains(str)) {
232 //        return DOLLAR_SIGN;
233 //    } else if (get(POUND_SIGN)->contains(str)) {
234 //        return POUND_SIGN;
235 //    } else if (get(RUPEE_SIGN)->contains(str)) {
236 //        return RUPEE_SIGN;
237 //    } else if (get(YEN_SIGN)->contains(str)) {
238 //        return YEN_SIGN;
239 //    } else {
240 //        return NONE;
241 //    }
242 //}
243 
244 
245 #endif /* #if !UCONFIG_NO_FORMATTING */
246