1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include <typeinfo>
13 #include <array>
14 #include "number_types.h"
15 #include "number_patternstring.h"
16 #include "numparse_types.h"
17 #include "numparse_impl.h"
18 #include "numparse_symbols.h"
19 #include "numparse_decimal.h"
20 #include "unicode/numberformatter.h"
21 #include "cstr.h"
22 #include "number_mapper.h"
23 #include "static_unicode_sets.h"
24 
25 using namespace icu;
26 using namespace icu::number;
27 using namespace icu::number::impl;
28 using namespace icu::numparse;
29 using namespace icu::numparse::impl;
30 
31 
32 NumberParseMatcher::~NumberParseMatcher() = default;
33 
34 
35 NumberParserImpl*
createSimpleParser(const Locale & locale,const UnicodeString & patternString,parse_flags_t parseFlags,UErrorCode & status)36 NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString,
37                                      parse_flags_t parseFlags, UErrorCode& status) {
38 
39     LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
40     DecimalFormatSymbols symbols(locale, status);
41 
42     parser->fLocalMatchers.ignorables = {parseFlags};
43     IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
44 
45     DecimalFormatSymbols dfs(locale, status);
46     dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$");
47     dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU");
48     CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status);
49 
50     ParsedPatternInfo patternInfo;
51     PatternParser::parseToPatternInfo(patternString, patternInfo, status);
52 
53     // The following statements set up the affix matchers.
54     AffixTokenMatcherSetupData affixSetupData = {
55             currencySymbols, symbols, ignorables, locale, parseFlags};
56     parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData};
57     parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse};
58     parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers(
59             patternInfo, *parser, ignorables, parseFlags, status);
60 
61     Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO);
62     grouper.setLocaleData(patternInfo, locale);
63 
64     parser->addMatcher(parser->fLocalMatchers.ignorables);
65     parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags});
66     parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false});
67     parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false});
68     parser->addMatcher(parser->fLocalMatchers.percent = {symbols});
69     parser->addMatcher(parser->fLocalMatchers.permille = {symbols});
70     parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
71     parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
72     parser->addMatcher(parser->fLocalMatchers.padding = {u"@"});
73     parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
74     parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status});
75     parser->addMatcher(parser->fLocalValidators.number = {});
76 
77     parser->freeze();
78     return parser.orphan();
79 }
80 
81 NumberParserImpl*
createParserFromProperties(const number::impl::DecimalFormatProperties & properties,const DecimalFormatSymbols & symbols,bool parseCurrency,UErrorCode & status)82 NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatProperties& properties,
83                                              const DecimalFormatSymbols& symbols, bool parseCurrency,
84                                              UErrorCode& status) {
85     Locale locale = symbols.getLocale();
86     PropertiesAffixPatternProvider localPAPP;
87     CurrencyPluralInfoAffixProvider localCPIAP;
88     AffixPatternProvider* affixProvider;
89     if (properties.currencyPluralInfo.fPtr.isNull()) {
90         localPAPP.setTo(properties, status);
91         affixProvider = &localPAPP;
92     } else {
93         localCPIAP.setTo(*properties.currencyPluralInfo.fPtr, properties, status);
94         affixProvider = &localCPIAP;
95     }
96     if (affixProvider == nullptr || U_FAILURE(status)) { return nullptr; }
97     CurrencyUnit currency = resolveCurrency(properties, locale, status);
98     CurrencySymbols currencySymbols(currency, locale, symbols, status);
99     bool isStrict = properties.parseMode.getOrDefault(PARSE_MODE_STRICT) == PARSE_MODE_STRICT;
100     Grouper grouper = Grouper::forProperties(properties);
101     int parseFlags = 0;
102     if (affixProvider == nullptr || U_FAILURE(status)) { return nullptr; }
103     if (!properties.parseCaseSensitive) {
104         parseFlags |= PARSE_FLAG_IGNORE_CASE;
105     }
106     if (properties.parseIntegerOnly) {
107         parseFlags |= PARSE_FLAG_INTEGER_ONLY;
108     }
109     if (properties.signAlwaysShown) {
110         parseFlags |= PARSE_FLAG_PLUS_SIGN_ALLOWED;
111     }
112     if (isStrict) {
113         parseFlags |= PARSE_FLAG_STRICT_GROUPING_SIZE;
114         parseFlags |= PARSE_FLAG_STRICT_SEPARATORS;
115         parseFlags |= PARSE_FLAG_USE_FULL_AFFIXES;
116         parseFlags |= PARSE_FLAG_EXACT_AFFIX;
117         parseFlags |= PARSE_FLAG_STRICT_IGNORABLES;
118     } else {
119         parseFlags |= PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
120     }
121     if (grouper.getPrimary() <= 0) {
122         parseFlags |= PARSE_FLAG_GROUPING_DISABLED;
123     }
124     if (parseCurrency || affixProvider->hasCurrencySign()) {
125         parseFlags |= PARSE_FLAG_MONETARY_SEPARATORS;
126     }
127     if (!parseCurrency) {
128         parseFlags |= PARSE_FLAG_NO_FOREIGN_CURRENCY;
129     }
130 
131     LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
132 
133     parser->fLocalMatchers.ignorables = {parseFlags};
134     IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
135 
136     //////////////////////
137     /// AFFIX MATCHERS ///
138     //////////////////////
139 
140     // The following statements set up the affix matchers.
141     AffixTokenMatcherSetupData affixSetupData = {
142             currencySymbols, symbols, ignorables, locale, parseFlags};
143     parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData};
144     parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse};
145     parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers(
146             *affixProvider, *parser, ignorables, parseFlags, status);
147 
148     ////////////////////////
149     /// CURRENCY MATCHER ///
150     ////////////////////////
151 
152     if (parseCurrency || affixProvider->hasCurrencySign()) {
153         parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status});
154     }
155 
156     ///////////////
157     /// PERCENT ///
158     ///////////////
159 
160     // ICU-TC meeting, April 11, 2018: accept percent/permille only if it is in the pattern,
161     // and to maintain regressive behavior, divide by 100 even if no percent sign is present.
162     if (!isStrict && affixProvider->containsSymbolType(AffixPatternType::TYPE_PERCENT, status)) {
163         parser->addMatcher(parser->fLocalMatchers.percent = {symbols});
164     }
165     if (!isStrict && affixProvider->containsSymbolType(AffixPatternType::TYPE_PERMILLE, status)) {
166         parser->addMatcher(parser->fLocalMatchers.permille = {symbols});
167     }
168 
169     ///////////////////////////////
170     /// OTHER STANDARD MATCHERS ///
171     ///////////////////////////////
172 
173     if (!isStrict) {
174         parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false});
175         parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false});
176     }
177     parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
178     parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
179     UnicodeString padString = properties.padString;
180     if (!padString.isBogus() && !ignorables.getSet()->contains(padString)) {
181         parser->addMatcher(parser->fLocalMatchers.padding = {padString});
182     }
183     parser->addMatcher(parser->fLocalMatchers.ignorables);
184     parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags});
185     // NOTE: parseNoExponent doesn't disable scientific parsing if we have a scientific formatter
186     if (!properties.parseNoExponent || properties.minimumExponentDigits > 0) {
187         parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
188     }
189 
190     //////////////////
191     /// VALIDATORS ///
192     //////////////////
193 
194     parser->addMatcher(parser->fLocalValidators.number = {});
195     if (isStrict) {
196         parser->addMatcher(parser->fLocalValidators.affix = {});
197     }
198     if (parseCurrency) {
199         parser->addMatcher(parser->fLocalValidators.currency = {});
200     }
201     if (properties.decimalPatternMatchRequired) {
202         bool patternHasDecimalSeparator =
203                 properties.decimalSeparatorAlwaysShown || properties.maximumFractionDigits != 0;
204         parser->addMatcher(parser->fLocalValidators.decimalSeparator = {patternHasDecimalSeparator});
205     }
206     // The multiplier takes care of scaling percentages.
207     Scale multiplier = scaleFromProperties(properties);
208     if (multiplier.isValid()) {
209         parser->addMatcher(parser->fLocalValidators.multiplier = {multiplier});
210     }
211 
212     parser->freeze();
213     return parser.orphan();
214 }
215 
NumberParserImpl(parse_flags_t parseFlags)216 NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags)
217         : fParseFlags(parseFlags) {
218 }
219 
~NumberParserImpl()220 NumberParserImpl::~NumberParserImpl() {
221     fNumMatchers = 0;
222 }
223 
addMatcher(NumberParseMatcher & matcher)224 void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) {
225     if (fNumMatchers + 1 > fMatchers.getCapacity()) {
226         fMatchers.resize(fNumMatchers * 2, fNumMatchers);
227     }
228     fMatchers[fNumMatchers] = &matcher;
229     fNumMatchers++;
230 }
231 
freeze()232 void NumberParserImpl::freeze() {
233     fFrozen = true;
234 }
235 
getParseFlags() const236 parse_flags_t NumberParserImpl::getParseFlags() const {
237     return fParseFlags;
238 }
239 
parse(const UnicodeString & input,bool greedy,ParsedNumber & result,UErrorCode & status) const240 void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result,
241                              UErrorCode& status) const {
242     return parse(input, 0, greedy, result, status);
243 }
244 
parse(const UnicodeString & input,int32_t start,bool greedy,ParsedNumber & result,UErrorCode & status) const245 void NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result,
246                              UErrorCode& status) const {
247     if (U_FAILURE(status)) {
248         return;
249     }
250     U_ASSERT(fFrozen);
251     // TODO: Check start >= 0 and start < input.length()
252     StringSegment segment(input, 0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE));
253     segment.adjustOffset(start);
254     if (greedy) {
255         parseGreedy(segment, result, status);
256     } else if (0 != (fParseFlags & PARSE_FLAG_ALLOW_INFINITE_RECURSION)) {
257         // Start at 1 so that recursionLevels never gets to 0
258         parseLongestRecursive(segment, result, 1, status);
259     } else {
260         // Arbitrary recursion safety limit: 100 levels.
261         parseLongestRecursive(segment, result, -100, status);
262     }
263     for (int32_t i = 0; i < fNumMatchers; i++) {
264         fMatchers[i]->postProcess(result);
265     }
266     result.postProcess();
267 }
268 
parseGreedy(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const269 void NumberParserImpl::parseGreedy(StringSegment& segment, ParsedNumber& result,
270                                             UErrorCode& status) const {
271     // Note: this method is not recursive in order to avoid stack overflow.
272     for (int i = 0; i <fNumMatchers;) {
273         // Base Case
274         if (segment.length() == 0) {
275             return;
276         }
277         const NumberParseMatcher* matcher = fMatchers[i];
278         if (!matcher->smokeTest(segment)) {
279             // Matcher failed smoke test: try the next one
280             i++;
281             continue;
282         }
283         int32_t initialOffset = segment.getOffset();
284         matcher->match(segment, result, status);
285         if (U_FAILURE(status)) {
286             return;
287         }
288         if (segment.getOffset() != initialOffset) {
289             // Greedy heuristic: accept the match and loop back
290             i = 0;
291             continue;
292         } else {
293             // Matcher did not match: try the next one
294             i++;
295             continue;
296         }
297         UPRV_UNREACHABLE;
298     }
299 
300     // NOTE: If we get here, the greedy parse completed without consuming the entire string.
301 }
302 
parseLongestRecursive(StringSegment & segment,ParsedNumber & result,int32_t recursionLevels,UErrorCode & status) const303 void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result,
304                                              int32_t recursionLevels,
305                                              UErrorCode& status) const {
306     // Base Case
307     if (segment.length() == 0) {
308         return;
309     }
310 
311     // Safety against stack overflow
312     if (recursionLevels == 0) {
313         return;
314     }
315 
316     // TODO: Give a nice way for the matcher to reset the ParsedNumber?
317     ParsedNumber initial(result);
318     ParsedNumber candidate;
319 
320     int initialOffset = segment.getOffset();
321     for (int32_t i = 0; i < fNumMatchers; i++) {
322         const NumberParseMatcher* matcher = fMatchers[i];
323         if (!matcher->smokeTest(segment)) {
324             continue;
325         }
326 
327         // In a non-greedy parse, we attempt all possible matches and pick the best.
328         for (int32_t charsToConsume = 0; charsToConsume < segment.length();) {
329             charsToConsume += U16_LENGTH(segment.codePointAt(charsToConsume));
330 
331             // Run the matcher on a segment of the current length.
332             candidate = initial;
333             segment.setLength(charsToConsume);
334             bool maybeMore = matcher->match(segment, candidate, status);
335             segment.resetLength();
336             if (U_FAILURE(status)) {
337                 return;
338             }
339 
340             // If the entire segment was consumed, recurse.
341             if (segment.getOffset() - initialOffset == charsToConsume) {
342                 parseLongestRecursive(segment, candidate, recursionLevels + 1, status);
343                 if (U_FAILURE(status)) {
344                     return;
345                 }
346                 if (candidate.isBetterThan(result)) {
347                     result = candidate;
348                 }
349             }
350 
351             // Since the segment can be re-used, reset the offset.
352             // This does not have an effect if the matcher did not consume any chars.
353             segment.setOffset(initialOffset);
354 
355             // Unless the matcher wants to see the next char, continue to the next matcher.
356             if (!maybeMore) {
357                 break;
358             }
359         }
360     }
361 }
362 
toString() const363 UnicodeString NumberParserImpl::toString() const {
364     UnicodeString result(u"<NumberParserImpl matchers:[");
365     for (int32_t i = 0; i < fNumMatchers; i++) {
366         result.append(u' ');
367         result.append(fMatchers[i]->toString());
368     }
369     result.append(u" ]>", -1);
370     return result;
371 }
372 
373 
374 #endif /* #if !UCONFIG_NO_FORMATTING */
375