1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
17 #include "putilimp.h"
18 #include "number_decimalquantity.h"
19 #include "string_segment.h"
20 
21 using namespace icu;
22 using namespace icu::numparse;
23 using namespace icu::numparse::impl;
24 
25 
DecimalMatcher(const DecimalFormatSymbols & symbols,const Grouper & grouper,parse_flags_t parseFlags)26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
27                                parse_flags_t parseFlags) {
28     if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
29         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
30         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
31     } else {
32         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
33         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
34     }
35     bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
36     unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
37                                                 : unisets::ALL_SEPARATORS;
38 
39     // Attempt to find separators in the static cache
40 
41     groupingUniSet = unisets::get(groupingKey);
42     unisets::Key decimalKey = unisets::chooseFrom(
43             decimalSeparator,
44             strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
45             strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
46     if (decimalKey >= 0) {
47         decimalUniSet = unisets::get(decimalKey);
48     } else if (!decimalSeparator.isEmpty()) {
49         auto* set = new UnicodeSet();
50         set->add(decimalSeparator.char32At(0));
51         set->freeze();
52         decimalUniSet = set;
53         fLocalDecimalUniSet.adoptInstead(set);
54     } else {
55         decimalUniSet = unisets::get(unisets::EMPTY);
56     }
57 
58     if (groupingKey >= 0 && decimalKey >= 0) {
59         // Everything is available in the static cache
60         separatorSet = groupingUniSet;
61         leadSet = unisets::get(
62                 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
63                                  : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
64     } else {
65         auto* set = new UnicodeSet();
66         set->addAll(*groupingUniSet);
67         set->addAll(*decimalUniSet);
68         set->freeze();
69         separatorSet = set;
70         fLocalSeparatorSet.adoptInstead(set);
71         leadSet = nullptr;
72     }
73 
74     UChar32 cpZero = symbols.getCodePointZero();
75     if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
76         // Uncommon case: okay to allocate.
77         auto digitStrings = new UnicodeString[10];
78         fLocalDigitStrings.adoptInstead(digitStrings);
79         for (int32_t i = 0; i <= 9; i++) {
80             digitStrings[i] = symbols.getConstDigitSymbol(i);
81         }
82     }
83 
84     requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
85     groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
86     integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
87     grouping1 = grouper.getPrimary();
88     grouping2 = grouper.getSecondary();
89 
90     // Fraction grouping parsing is disabled for now but could be enabled later.
91     // See https://unicode-org.atlassian.net/browse/ICU-10794
92     // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
93 }
94 
match(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const95 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
96     return match(segment, result, 0, status);
97 }
98 
/**
 * Attempts to consume a decimal number from the front of `segment`: digits, optionally
 * interleaved with grouping separators and a decimal separator.
 *
 * If a number is accepted, the segment offset is left just past the consumed text and the
 * consumed length is recorded through result.setCharsConsumed(). If nothing is accepted (no
 * digits were seen, or a grouping-size check failed), the segment offset is restored to its
 * value on entry and `result` is not modified.
 *
 * @param segment      The remaining input; its offset is advanced while matching.
 * @param result       Receives the parsed quantity and flags.
 * @param exponentSign 0 when parsing an ordinary number; the digits are then stored into
 *                     result.quantity. When nonzero and text was consumed, the digits are
 *                     treated as an exponent and result.quantity's magnitude is adjusted by
 *                     exponentSign times the parsed value.
 * @param (unnamed)    UErrorCode; not used by this function.
 * @return Whether the caller should ask for more characters.
 */
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
                           UErrorCode&) const {
    if (result.seenNumber() && exponentSign == 0) {
        // A number has already been consumed.
        return false;
    } else if (exponentSign != 0) {
        // scientific notation always comes after the number
        U_ASSERT(!result.quantity.bogus);
    }

    // Initial offset before any character consumption.
    int32_t initialOffset = segment.getOffset();

    // Return value: whether to ask for more characters.
    bool maybeMore = false;

    // All digits consumed so far.
    // Starts out bogus; it is un-bogused when the first digit is appended.
    number::impl::DecimalQuantity digitsConsumed;
    digitsConsumed.bogus = true;

    // The total number of digits after the decimal place, used for scaling the result.
    int32_t digitsAfterDecimalPlace = 0;

    // The actual grouping and decimal separators used in the string.
    // If non-bogus, we have seen that token.
    UnicodeString actualGroupingString;
    UnicodeString actualDecimalString;
    actualGroupingString.setToBogus();
    actualDecimalString.setToBogus();

    // Information for two groups: the previous group and the current group.
    //
    // Each group has three pieces of information:
    //
    // Offset: the string position of the beginning of the group, including a leading separator
    // if there was a leading separator. This is needed in case we need to rewind the parse to
    // that position.
    //
    // Separator type:
    // 0 => beginning of string
    // 1 => lead separator is a grouping separator
    // 2 => lead separator is a decimal separator
    //
    // Count: the number of digits in the group. If -1, the group has been validated.
    int32_t currGroupOffset = 0;
    int32_t currGroupSepType = 0;
    int32_t currGroupCount = 0;
    int32_t prevGroupOffset = -1;
    int32_t prevGroupSepType = -1;
    int32_t prevGroupCount = -1;

    while (segment.length() > 0) {
        // Recomputed on every iteration: only partial matches at the current position count.
        maybeMore = false;

        // Attempt to match a digit.
        int8_t digit = -1;

        // Try by code point digit value.
        UChar32 cp = segment.getCodePoint();
        if (u_isdigit(cp)) {
            segment.adjustOffset(U16_LENGTH(cp));
            digit = static_cast<int8_t>(u_digit(cp, 10));
        }

        // Try by digit string.
        // fLocalDigitStrings is only populated by the constructor in the uncommon case where
        // the locale's zero is not recognized as digit zero by u_isdigit/u_digit.
        if (digit == -1 && !fLocalDigitStrings.isNull()) {
            for (int32_t i = 0; i < 10; i++) {
                const UnicodeString& str = fLocalDigitStrings[i];
                if (str.isEmpty()) {
                    continue;
                }
                int32_t overlap = segment.getCommonPrefixLength(str);
                if (overlap == str.length()) {
                    segment.adjustOffset(overlap);
                    digit = static_cast<int8_t>(i);
                    break;
                }
                // If the common prefix spans everything left in the segment, additional input
                // might complete this digit string.
                maybeMore = maybeMore || (overlap == segment.length());
            }
        }

        if (digit >= 0) {
            // Digit was found.
            if (digitsConsumed.bogus) {
                digitsConsumed.bogus = false;
                digitsConsumed.clear();
            }
            digitsConsumed.appendDigit(digit, 0, true);
            currGroupCount++;
            if (!actualDecimalString.isBogus()) {
                digitsAfterDecimalPlace++;
            }
            continue;
        }

        // Attempt to match a literal grouping or decimal separator.
        bool isDecimal = false;
        bool isGrouping = false;

        // 1) Attempt the decimal separator string literal.
        // if (we have not seen a decimal separator yet) { ... }
        if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
            int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == decimalSeparator.length()) {
                isDecimal = true;
                actualDecimalString = decimalSeparator;
            }
        }

        // 2) Attempt to match the actual grouping string literal.
        // (Only the grouping string already seen earlier in this number is accepted again.)
        if (!actualGroupingString.isBogus()) {
            int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == actualGroupingString.length()) {
                isGrouping = true;
            }
        }

        // 2.5) Attempt to match a new grouping separator string literal.
        // if (we have not seen a grouping or decimal separator yet) { ... }
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
            !groupingSeparator.isEmpty()) {
            int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == groupingSeparator.length()) {
                isGrouping = true;
                actualGroupingString = groupingSeparator;
            }
        }

        // 3) Attempt to match a decimal separator from the equivalence set.
        // if (we have not seen a decimal separator yet) { ... }
        // The !isGrouping is to confirm that we haven't yet matched the current character.
        if (!isGrouping && actualDecimalString.isBogus()) {
            if (decimalUniSet->contains(cp)) {
                isDecimal = true;
                actualDecimalString = UnicodeString(cp);
            }
        }

        // 4) Attempt to match a grouping separator from the equivalence set.
        // if (we have not seen a grouping or decimal separator yet) { ... }
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
            if (groupingUniSet->contains(cp)) {
                isGrouping = true;
                actualGroupingString = UnicodeString(cp);
            }
        }

        // Leave if we failed to match this as a separator.
        if (!isDecimal && !isGrouping) {
            break;
        }

        // Check for conditions when we don't want to accept the separator.
        if (isDecimal && integerOnly) {
            break;
        } else if (currGroupSepType == 2 && isGrouping) {
            // Fraction grouping
            break;
        }

        // Validate intermediate grouping sizes.
        // The previous group is checked against the secondary size; the current group is
        // checked against the primary size, which only matters here when a decimal separator
        // is about to close the integer part.
        bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
        bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
        if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
            // Invalid grouping sizes.
            if (isGrouping && currGroupCount == 0) {
                // Trailing grouping separators: these are taken care of below
                U_ASSERT(currGroupSepType == 1);
            } else if (requireGroupingMatch) {
                // Strict mode: reject the parse
                digitsConsumed.clear();
                digitsConsumed.bogus = true;
            }
            break;
        } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
            // Strict mode: stop at a separator that directly follows a grouping separator.
            break;
        } else {
            // Grouping sizes OK so far.
            prevGroupOffset = currGroupOffset;
            prevGroupCount = currGroupCount;
            if (isDecimal) {
                // Do not validate this group any more.
                prevGroupSepType = -1;
            } else {
                prevGroupSepType = currGroupSepType;
            }
        }

        // OK to accept the separator.
        // Special case: don't update currGroup if it is empty; this allows two grouping
        // separators in a row in lenient mode.
        if (currGroupCount != 0) {
            currGroupOffset = segment.getOffset();
        }
        currGroupSepType = isGrouping ? 1 : 2;
        currGroupCount = 0;
        if (isGrouping) {
            segment.adjustOffset(actualGroupingString.length());
        } else {
            segment.adjustOffset(actualDecimalString.length());
        }
    }

    // End of main loop.
    // Back up if there was a trailing grouping separator.
    // Shift prev -> curr so we can check it as a final group.
    if (currGroupSepType != 2 && currGroupCount == 0) {
        maybeMore = true;
        segment.setOffset(currGroupOffset);
        currGroupOffset = prevGroupOffset;
        currGroupSepType = prevGroupSepType;
        currGroupCount = prevGroupCount;
        // Placeholder "previous group": separator type 0 with one digit.
        prevGroupOffset = -1;
        prevGroupSepType = 0;
        prevGroupCount = 1;
    }

    // Validate final grouping sizes.
    bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
    bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
    if (!requireGroupingMatch) {
        // The cases we need to handle here are lone digits.
        // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
        // See more examples in numberformattestspecification.txt
        int32_t digitsToRemove = 0;
        if (!prevValidSecondary) {
            // Rewind to the start of the previous group and drop the digits of both groups.
            segment.setOffset(prevGroupOffset);
            digitsToRemove += prevGroupCount;
            digitsToRemove += currGroupCount;
        } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
            // Rewind to the start of the current group and drop only its digits.
            maybeMore = true;
            segment.setOffset(currGroupOffset);
            digitsToRemove += currGroupCount;
        }
        if (digitsToRemove != 0) {
            // Shift the dropped trailing digits below the decimal point, then cut them off.
            digitsConsumed.adjustMagnitude(-digitsToRemove);
            digitsConsumed.truncate();
        }
        // Lenient mode never reports a grouping failure after this point.
        prevValidSecondary = true;
        currValidPrimary = true;
    }
    if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
        // Grouping failure.
        digitsConsumed.bogus = true;
    }

    // Strings that start with a separator but have no digits,
    // or strings that failed a grouping size check.
    if (digitsConsumed.bogus) {
        maybeMore = maybeMore || (segment.length() == 0);
        segment.setOffset(initialOffset);
        return maybeMore;
    }

    // We passed all inspections. Start post-processing.

    // Adjust for fraction part.
    digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);

    // Set the digits, either normal or exponent.
    if (exponentSign != 0 && segment.getOffset() != initialOffset) {
        // Exponent mode: apply the parsed digits as a magnitude shift on the existing quantity.
        // Overflow is flagged when the value does not fit in a long, exceeds INT32_MAX, or
        // adjustMagnitude() returns true.
        bool overflow = false;
        if (digitsConsumed.fitsInLong()) {
            int64_t exponentLong = digitsConsumed.toLong(false);
            U_ASSERT(exponentLong >= 0);
            if (exponentLong <= INT32_MAX) {
                auto exponentInt = static_cast<int32_t>(exponentLong);
                if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
                    overflow = true;
                }
            } else {
                overflow = true;
            }
        } else {
            overflow = true;
        }
        if (overflow) {
            if (exponentSign == -1) {
                // Set to zero
                result.quantity.clear();
            } else {
                // Set to infinity
                result.quantity.bogus = true;
                result.flags |= FLAG_INFINITY;
            }
        }
    } else {
        result.quantity = digitsConsumed;
    }

    // Set other information into the result and return.
    if (!actualDecimalString.isBogus()) {
        result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
    }
    result.setCharsConsumed(segment);
    return segment.length() == 0 || maybeMore;
}
399 
validateGroup(int32_t sepType,int32_t count,bool isPrimary) const400 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
401     if (requireGroupingMatch) {
402         if (sepType == -1) {
403             // No such group (prevGroup before first shift).
404             return true;
405         } else if (sepType == 0) {
406             // First group.
407             if (isPrimary) {
408                 // No grouping separators is OK.
409                 return true;
410             } else {
411                 return count != 0 && count <= grouping2;
412             }
413         } else if (sepType == 1) {
414             // Middle group.
415             if (isPrimary) {
416                 return count == grouping1;
417             } else {
418                 return count == grouping2;
419             }
420         } else {
421             U_ASSERT(sepType == 2);
422             // After the decimal separator.
423             return true;
424         }
425     } else {
426         if (sepType == 1) {
427             // #11230: don't accept middle groups with only 1 digit.
428             return count != 1;
429         } else {
430             return true;
431         }
432     }
433 }
434 
smokeTest(const StringSegment & segment) const435 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
436     // The common case uses a static leadSet for efficiency.
437     if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
438         return segment.startsWith(*leadSet);
439     }
440     if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
441         return true;
442     }
443     if (fLocalDigitStrings.isNull()) {
444         return false;
445     }
446     for (int32_t i = 0; i < 10; i++) {
447         if (segment.startsWith(fLocalDigitStrings[i])) {
448             return true;
449         }
450     }
451     return false;
452 }
453 
toString() const454 UnicodeString DecimalMatcher::toString() const {
455     return u"<Decimal>";
456 }
457 
458 
459 #endif /* #if !UCONFIG_NO_FORMATTING */
460