1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #include <cstdlib>
9 
10 #include "unicode/simpleformatter.h"
11 #include "unicode/ures.h"
12 #include "ureslocs.h"
13 #include "charstr.h"
14 #include "uresimp.h"
15 #include "measunit_impl.h"
16 #include "number_longnames.h"
17 #include "number_microprops.h"
18 #include <algorithm>
19 #include "cstring.h"
20 #include "util.h"
21 
22 using namespace icu;
23 using namespace icu::number;
24 using namespace icu::number::impl;
25 
26 namespace {
27 
/**
 * Display Name (this format has no placeholder).
 *
 * Used as an index into the LongNameHandler::simpleFormats array. Units
 * resources cover the normal set of PluralRules keys, as well as `dnam` and
 * `per` forms.
 *
 * This is the slot getIndex() returns for the resource key "dnam". It is the
 * first slot after the StandardPlural::Form slots.
 */
constexpr int32_t DNAM_INDEX = StandardPlural::Form::COUNT;
/**
 * "per" form (e.g. "{0} per day" is day's "per" form).
 *
 * Used as an index into the LongNameHandler::simpleFormats array. Units
 * resources cover the normal set of PluralRules keys, as well as `dnam` and
 * `per` forms.
 *
 * This is the slot getIndex() returns for the resource key "per".
 */
constexpr int32_t PER_INDEX = StandardPlural::Form::COUNT + 1;
/**
 * Gender of the word, in languages with grammatical gender.
 *
 * This is the slot getIndex() returns for the resource key "gender".
 */
constexpr int32_t GENDER_INDEX = StandardPlural::Form::COUNT + 2;
// Number of keys in the array populated by PluralTableSink: one slot per
// StandardPlural::Form, followed by the DNAM_INDEX, PER_INDEX and GENDER_INDEX
// slots. InflectedPluralSink initializes arrays of this same length.
constexpr int32_t ARRAY_LENGTH = StandardPlural::Form::COUNT + 3;
50 
// TODO(icu-units#28): load this list from resources, after creating a "&set"
// function for use in ldml2icu rules.
//
// Number of entries in gGenders.
const int32_t GENDER_COUNT = 7;
// Known grammatical gender names. getGenderString() binary-searches this array
// with uprv_strcmp(), so the entries MUST stay in ascending byte order when the
// list is edited.
const char *gGenders[GENDER_COUNT] = {"animate",   "common", "feminine", "inanimate",
                                      "masculine", "neuter", "personal"};
56 
57 // Converts a UnicodeString to a const char*, either pointing to a string in
58 // gGenders, or pointing to an empty string if an appropriate string was not
59 // found.
getGenderString(UnicodeString uGender,UErrorCode status)60 const char *getGenderString(UnicodeString uGender, UErrorCode status) {
61     if (uGender.length() == 0) {
62         return "";
63     }
64     CharString gender;
65     gender.appendInvariantChars(uGender, status);
66     if (U_FAILURE(status)) {
67         return "";
68     }
69     int32_t first = 0;
70     int32_t last = GENDER_COUNT;
71     while (first < last) {
72         int32_t mid = (first + last) / 2;
73         int32_t cmp = uprv_strcmp(gender.data(), gGenders[mid]);
74         if (cmp == 0) {
75             return gGenders[mid];
76         } else if (cmp > 0) {
77             first = mid + 1;
78         } else if (cmp < 0) {
79             last = mid;
80         }
81     }
82     // We don't return an error in case our gGenders list is incomplete in
83     // production.
84     //
85     // TODO(icu-units#28): a unit test checking all locales' genders are covered
86     // by gGenders? Else load a complete list of genders found in
87     // grammaticalFeatures in an initOnce.
88     return "";
89 }
90 
91 // Returns the array index that corresponds to the given pluralKeyword.
getIndex(const char * pluralKeyword,UErrorCode & status)92 static int32_t getIndex(const char* pluralKeyword, UErrorCode& status) {
93     // pluralKeyword can also be "dnam", "per", or "gender"
94     switch (*pluralKeyword) {
95     case 'd':
96         if (uprv_strcmp(pluralKeyword + 1, "nam") == 0) {
97             return DNAM_INDEX;
98         }
99         break;
100     case 'g':
101         if (uprv_strcmp(pluralKeyword + 1, "ender") == 0) {
102             return GENDER_INDEX;
103         }
104         break;
105     case 'p':
106         if (uprv_strcmp(pluralKeyword + 1, "er") == 0) {
107             return PER_INDEX;
108         }
109         break;
110     default:
111         break;
112     }
113     StandardPlural::Form plural = StandardPlural::fromString(pluralKeyword, status);
114     return plural;
115 }
116 
117 // Selects a string out of the `strings` array which corresponds to the
118 // specified plural form, with fallback to the OTHER form.
119 //
120 // The `strings` array must have ARRAY_LENGTH items: one corresponding to each
121 // of the plural forms, plus a display name ("dnam") and a "per" form.
getWithPlural(const UnicodeString * strings,StandardPlural::Form plural,UErrorCode & status)122 static UnicodeString getWithPlural(
123         const UnicodeString* strings,
124         StandardPlural::Form plural,
125         UErrorCode& status) {
126     UnicodeString result = strings[plural];
127     if (result.isBogus()) {
128         result = strings[StandardPlural::Form::OTHER];
129     }
130     if (result.isBogus()) {
131         // There should always be data in the "other" plural variant.
132         status = U_INTERNAL_PROGRAM_ERROR;
133     }
134     return result;
135 }
136 
// Where the "{0}" placeholder sits inside a unit pattern, as reported by
// extractCorePattern(): PH_BEGINNING / PH_END when the pattern starts / ends
// with the placeholder, PH_MIDDLE when it occurs elsewhere, PH_NONE when the
// pattern contains no placeholder. extractCorePattern() itself never reports
// PH_EMPTY.
enum PlaceholderPosition { PH_EMPTY, PH_NONE, PH_BEGINNING, PH_MIDDLE, PH_END };
138 
139 /**
140  * Returns three outputs extracted from pattern.
141  *
142  * @param coreUnit is extracted as per Extract(...) in the spec:
143  *   https://unicode.org/reports/tr35/tr35-general.html#compound-units
144  * @param PlaceholderPosition indicates where in the string the placeholder was
145  *   found.
146  * @param joinerChar Iff the placeholder was at the beginning or end, joinerChar
147  *   contains the space character (if any) that separated the placeholder from
148  *   the rest of the pattern. Otherwise, joinerChar is set to NUL. Only one
149  *   space character is considered.
150  */
extractCorePattern(const UnicodeString & pattern,UnicodeString & coreUnit,PlaceholderPosition & placeholderPosition,UChar & joinerChar)151 void extractCorePattern(const UnicodeString &pattern,
152                         UnicodeString &coreUnit,
153                         PlaceholderPosition &placeholderPosition,
154                         UChar &joinerChar) {
155     joinerChar = 0;
156     int32_t len = pattern.length();
157     if (pattern.startsWith(u"{0}", 3)) {
158         placeholderPosition = PH_BEGINNING;
159         if (u_isJavaSpaceChar(pattern[3])) {
160             joinerChar = pattern[3];
161             coreUnit.setTo(pattern, 4, len - 4);
162         } else {
163             coreUnit.setTo(pattern, 3, len - 3);
164         }
165     } else if (pattern.endsWith(u"{0}", 3)) {
166         placeholderPosition = PH_END;
167         if (u_isJavaSpaceChar(pattern[len - 4])) {
168             coreUnit.setTo(pattern, 0, len - 4);
169             joinerChar = pattern[len - 4];
170         } else {
171             coreUnit.setTo(pattern, 0, len - 3);
172         }
173     } else if (pattern.indexOf(u"{0}", 3, 1, len - 2) == -1) {
174         placeholderPosition = PH_NONE;
175         coreUnit = pattern;
176     } else {
177         placeholderPosition = PH_MIDDLE;
178         coreUnit = pattern;
179     }
180 }
181 
182 //////////////////////////
183 /// BEGIN DATA LOADING ///
184 //////////////////////////
185 
186 // Gets the gender of a built-in unit: unit must be a built-in. Returns an empty
187 // string both in case of unknown gender and in case of unknown unit.
188 UnicodeString
getGenderForBuiltin(const Locale & locale,const MeasureUnit & builtinUnit,UErrorCode & status)189 getGenderForBuiltin(const Locale &locale, const MeasureUnit &builtinUnit, UErrorCode &status) {
190     LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status));
191     if (U_FAILURE(status)) { return {}; }
192 
193     // Map duration-year-person, duration-week-person, etc. to duration-year, duration-week, ...
194     // TODO(ICU-20400): Get duration-*-person data properly with aliases.
195     StringPiece subtypeForResource;
196     int32_t subtypeLen = static_cast<int32_t>(uprv_strlen(builtinUnit.getSubtype()));
197     if (subtypeLen > 7 && uprv_strcmp(builtinUnit.getSubtype() + subtypeLen - 7, "-person") == 0) {
198         subtypeForResource = {builtinUnit.getSubtype(), subtypeLen - 7};
199     } else {
200         subtypeForResource = builtinUnit.getSubtype();
201     }
202 
203     CharString key;
204     key.append("units/", status);
205     key.append(builtinUnit.getType(), status);
206     key.append("/", status);
207     key.append(subtypeForResource, status);
208     key.append("/gender", status);
209 
210     UErrorCode localStatus = status;
211     int32_t resultLen = 0;
212     const UChar *result =
213         ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &resultLen, &localStatus);
214     if (U_SUCCESS(localStatus)) {
215         status = localStatus;
216         return UnicodeString(true, result, resultLen);
217     } else {
218         // TODO(icu-units#28): "$unitRes/gender" does not exist. Do we want to
219         // check whether the parent "$unitRes" exists? Then we could return
220         // U_MISSING_RESOURCE_ERROR for incorrect usage (e.g. builtinUnit not
221         // being a builtin).
222         return {};
223     }
224 }
225 
226 // Loads data from a resource tree with paths matching
227 // $key/$pluralForm/$gender/$case, with lateral inheritance for missing cases
228 // and genders.
229 //
230 // An InflectedPluralSink is configured to load data for a specific gender and
231 // case. It loads all plural forms, because selection between plural forms is
232 // dependent upon the value being formatted.
233 //
234 // See data/unit/de.txt and data/unit/fr.txt for examples - take a look at
235 // units/compound/power2: German has case, French has differences for gender,
236 // but no case.
237 //
238 // TODO(icu-units#138): Conceptually similar to PluralTableSink, however the
239 // tree structures are different. After homogenizing the structures, we may be
240 // able to unify the two classes.
241 //
242 // TODO: Spec violation: expects presence of "count" - does not fallback to an
243 // absent "count"! If this fallback were added, getCompoundValue could be
244 // superseded?
245 class InflectedPluralSink : public ResourceSink {
246   public:
247     // Accepts `char*` rather than StringPiece because
248     // ResourceTable::findValue(...) requires a null-terminated `char*`.
249     //
250     // NOTE: outArray MUST have a length of at least ARRAY_LENGTH. No bounds
251     // checking is performed.
InflectedPluralSink(const char * gender,const char * caseVariant,UnicodeString * outArray)252     explicit InflectedPluralSink(const char *gender, const char *caseVariant, UnicodeString *outArray)
253         : gender(gender), caseVariant(caseVariant), outArray(outArray) {
254         // Initialize the array to bogus strings.
255         for (int32_t i = 0; i < ARRAY_LENGTH; i++) {
256             outArray[i].setToBogus();
257         }
258     }
259 
260     // See ResourceSink::put().
put(const char * key,ResourceValue & value,UBool,UErrorCode & status)261     void put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) U_OVERRIDE {
262         int32_t pluralIndex = getIndex(key, status);
263         if (U_FAILURE(status)) { return; }
264         if (!outArray[pluralIndex].isBogus()) {
265             // We already have a pattern
266             return;
267         }
268         ResourceTable genderTable = value.getTable(status);
269         ResourceTable caseTable; // This instance has to outlive `value`
270         if (loadForPluralForm(genderTable, caseTable, value, status)) {
271             outArray[pluralIndex] = value.getUnicodeString(status);
272         }
273     }
274 
275   private:
276     // Tries to load data for the configured gender from `genderTable`. Returns
277     // true if found, returning the data in `value`. The returned data will be
278     // for the configured gender if found, falling back to "neuter" and
279     // no-gender if not. The caseTable parameter holds the intermediate
280     // ResourceTable for the sake of lifetime management.
loadForPluralForm(const ResourceTable & genderTable,ResourceTable & caseTable,ResourceValue & value,UErrorCode & status)281     bool loadForPluralForm(const ResourceTable &genderTable,
282                            ResourceTable &caseTable,
283                            ResourceValue &value,
284                            UErrorCode &status) {
285         if (uprv_strcmp(gender, "") != 0) {
286             if (loadForGender(genderTable, gender, caseTable, value, status)) {
287                 return true;
288             }
289             if (uprv_strcmp(gender, "neuter") != 0 &&
290                 loadForGender(genderTable, "neuter", caseTable, value, status)) {
291                 return true;
292             }
293         }
294         if (loadForGender(genderTable, "_", caseTable, value, status)) {
295             return true;
296         }
297         return false;
298     }
299 
300     // Tries to load data for the given gender from `genderTable`. Returns true
301     // if found, returning the data in `value`. The returned data will be for
302     // the configured case if found, falling back to "nominative" and no-case if
303     // not.
loadForGender(const ResourceTable & genderTable,const char * genderVal,ResourceTable & caseTable,ResourceValue & value,UErrorCode & status)304     bool loadForGender(const ResourceTable &genderTable,
305                        const char *genderVal,
306                        ResourceTable &caseTable,
307                        ResourceValue &value,
308                        UErrorCode &status) {
309         if (!genderTable.findValue(genderVal, value)) {
310             return false;
311         }
312         caseTable = value.getTable(status);
313         if (uprv_strcmp(caseVariant, "") != 0) {
314             if (loadForCase(caseTable, caseVariant, value)) {
315                 return true;
316             }
317             if (uprv_strcmp(caseVariant, "nominative") != 0 &&
318                 loadForCase(caseTable, "nominative", value)) {
319                 return true;
320             }
321         }
322         if (loadForCase(caseTable, "_", value)) {
323             return true;
324         }
325         return false;
326     }
327 
328     // Tries to load data for the given case from `caseTable`. Returns true if
329     // found, returning the data in `value`.
loadForCase(const ResourceTable & caseTable,const char * caseValue,ResourceValue & value)330     bool loadForCase(const ResourceTable &caseTable, const char *caseValue, ResourceValue &value) {
331         if (!caseTable.findValue(caseValue, value)) {
332             return false;
333         }
334         return true;
335     }
336 
337     const char *gender;
338     const char *caseVariant;
339     UnicodeString *outArray;
340 };
341 
342 // Fetches localised formatting patterns for the given subKey. See documentation
343 // for InflectedPluralSink for details.
344 //
345 // Data is loaded for the appropriate unit width, with missing data filled in
346 // from unitsShort.
getInflectedMeasureData(StringPiece subKey,const Locale & locale,const UNumberUnitWidth & width,const char * gender,const char * caseVariant,UnicodeString * outArray,UErrorCode & status)347 void getInflectedMeasureData(StringPiece subKey,
348                              const Locale &locale,
349                              const UNumberUnitWidth &width,
350                              const char *gender,
351                              const char *caseVariant,
352                              UnicodeString *outArray,
353                              UErrorCode &status) {
354     InflectedPluralSink sink(gender, caseVariant, outArray);
355     LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status));
356     if (U_FAILURE(status)) { return; }
357 
358     CharString key;
359     key.append("units", status);
360     if (width == UNUM_UNIT_WIDTH_NARROW) {
361         key.append("Narrow", status);
362     } else if (width == UNUM_UNIT_WIDTH_SHORT) {
363         key.append("Short", status);
364     }
365     key.append("/", status);
366     key.append(subKey, status);
367 
368     UErrorCode localStatus = status;
369     ures_getAllChildrenWithFallback(unitsBundle.getAlias(), key.data(), sink, localStatus);
370     if (width == UNUM_UNIT_WIDTH_SHORT) {
371         status = localStatus;
372         return;
373     }
374 }
375 
376 class PluralTableSink : public ResourceSink {
377   public:
378     // NOTE: outArray MUST have a length of at least ARRAY_LENGTH. No bounds
379     // checking is performed.
PluralTableSink(UnicodeString * outArray)380     explicit PluralTableSink(UnicodeString *outArray) : outArray(outArray) {
381         // Initialize the array to bogus strings.
382         for (int32_t i = 0; i < ARRAY_LENGTH; i++) {
383             outArray[i].setToBogus();
384         }
385     }
386 
put(const char * key,ResourceValue & value,UBool,UErrorCode & status)387     void put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) U_OVERRIDE {
388         if (uprv_strcmp(key, "case") == 0) {
389             return;
390         }
391         int32_t index = getIndex(key, status);
392         if (U_FAILURE(status)) { return; }
393         if (!outArray[index].isBogus()) {
394             return;
395         }
396         outArray[index] = value.getUnicodeString(status);
397         if (U_FAILURE(status)) { return; }
398     }
399 
400   private:
401     UnicodeString *outArray;
402 };
403 
404 /**
405  * Populates outArray with `locale`-specific values for `unit` through use of
406  * PluralTableSink. Only the set of basic units are supported!
407  *
408  * Reading from resources *unitsNarrow* and *unitsShort* (for width
409  * UNUM_UNIT_WIDTH_NARROW), or just *unitsShort* (for width
410  * UNUM_UNIT_WIDTH_SHORT). For other widths, it reads just "units".
411  *
412  * @param unit must be a built-in unit, i.e. must have a type and subtype,
413  *     listed in gTypes and gSubTypes in measunit.cpp.
414  * @param unitDisplayCase the empty string and "nominative" are treated the
415  *     same. For other cases, strings for the requested case are used if found.
416  *     (For any missing case-specific data, we fall back to nominative.)
417  * @param outArray must be of fixed length ARRAY_LENGTH.
418  */
getMeasureData(const Locale & locale,const MeasureUnit & unit,const UNumberUnitWidth & width,const char * unitDisplayCase,UnicodeString * outArray,UErrorCode & status)419 void getMeasureData(const Locale &locale,
420                     const MeasureUnit &unit,
421                     const UNumberUnitWidth &width,
422                     const char *unitDisplayCase,
423                     UnicodeString *outArray,
424                     UErrorCode &status) {
425     PluralTableSink sink(outArray);
426     LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status));
427     if (U_FAILURE(status)) { return; }
428 
429     CharString subKey;
430     subKey.append("/", status);
431     subKey.append(unit.getType(), status);
432     subKey.append("/", status);
433 
434     // Map duration-year-person, duration-week-person, etc. to duration-year, duration-week, ...
435     // TODO(ICU-20400): Get duration-*-person data properly with aliases.
436     int32_t subtypeLen = static_cast<int32_t>(uprv_strlen(unit.getSubtype()));
437     if (subtypeLen > 7 && uprv_strcmp(unit.getSubtype() + subtypeLen - 7, "-person") == 0) {
438         subKey.append({unit.getSubtype(), subtypeLen - 7}, status);
439     } else {
440         subKey.append({unit.getSubtype(), subtypeLen}, status);
441     }
442 
443     if (width != UNUM_UNIT_WIDTH_FULL_NAME) {
444         UErrorCode localStatus = status;
445         CharString genderKey;
446         genderKey.append("units", localStatus);
447         genderKey.append(subKey, localStatus);
448         genderKey.append("/gender", localStatus);
449         StackUResourceBundle fillIn;
450         ures_getByKeyWithFallback(unitsBundle.getAlias(), genderKey.data(), fillIn.getAlias(),
451                                   &localStatus);
452         outArray[GENDER_INDEX] = ures_getUnicodeString(fillIn.getAlias(), &localStatus);
453     }
454 
455     CharString key;
456     key.append("units", status);
457     if (width == UNUM_UNIT_WIDTH_NARROW) {
458         key.append("Narrow", status);
459     } else if (width == UNUM_UNIT_WIDTH_SHORT) {
460         key.append("Short", status);
461     }
462     key.append(subKey, status);
463 
464     // Grab desired case first, if available. Then grab no-case data to fill in
465     // the gaps.
466     if (width == UNUM_UNIT_WIDTH_FULL_NAME && unitDisplayCase[0] != 0) {
467         CharString caseKey;
468         caseKey.append(key, status);
469         caseKey.append("/case/", status);
470         caseKey.append(unitDisplayCase, status);
471 
472         UErrorCode localStatus = U_ZERO_ERROR;
473         // TODO(icu-units#138): our fallback logic is not spec-compliant:
474         // lateral fallback should happen before locale fallback. Switch to
475         // getInflectedMeasureData after homogenizing data format? Find a unit
476         // test case that demonstrates the incorrect fallback logic (via
477         // regional variant of an inflected language?)
478         ures_getAllChildrenWithFallback(unitsBundle.getAlias(), caseKey.data(), sink, localStatus);
479     }
480 
481     // TODO(icu-units#138): our fallback logic is not spec-compliant: we
482     // check the given case, then go straight to the no-case data. The spec
483     // states we should first look for case="nominative". As part of #138,
484     // either get the spec changed, or add unit tests that warn us if
485     // case="nominative" data differs from no-case data?
486     UErrorCode localStatus = U_ZERO_ERROR;
487     ures_getAllChildrenWithFallback(unitsBundle.getAlias(), key.data(), sink, localStatus);
488     if (width == UNUM_UNIT_WIDTH_SHORT) {
489         if (U_FAILURE(localStatus)) {
490             status = localStatus;
491         }
492         return;
493     }
494 }
495 
496 // NOTE: outArray MUST have a length of at least ARRAY_LENGTH.
getCurrencyLongNameData(const Locale & locale,const CurrencyUnit & currency,UnicodeString * outArray,UErrorCode & status)497 void getCurrencyLongNameData(const Locale &locale, const CurrencyUnit &currency, UnicodeString *outArray,
498                              UErrorCode &status) {
499     // In ICU4J, this method gets a CurrencyData from CurrencyData.provider.
500     // TODO(ICU4J): Implement this without going through CurrencyData, like in ICU4C?
501     PluralTableSink sink(outArray);
502     LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_CURR, locale.getName(), &status));
503     if (U_FAILURE(status)) { return; }
504     ures_getAllChildrenWithFallback(unitsBundle.getAlias(), "CurrencyUnitPatterns", sink, status);
505     if (U_FAILURE(status)) { return; }
506     for (int32_t i = 0; i < StandardPlural::Form::COUNT; i++) {
507         UnicodeString &pattern = outArray[i];
508         if (pattern.isBogus()) {
509             continue;
510         }
511         int32_t longNameLen = 0;
512         const char16_t *longName = ucurr_getPluralName(
513                 currency.getISOCurrency(),
514                 locale.getName(),
515                 nullptr /* isChoiceFormat */,
516                 StandardPlural::getKeyword(static_cast<StandardPlural::Form>(i)),
517                 &longNameLen,
518                 &status);
519         // Example pattern from data: "{0} {1}"
520         // Example output after find-and-replace: "{0} US dollars"
521         pattern.findAndReplace(UnicodeString(u"{1}"), UnicodeString(longName, longNameLen));
522     }
523 }
524 
getCompoundValue(StringPiece compoundKey,const Locale & locale,const UNumberUnitWidth & width,UErrorCode & status)525 UnicodeString getCompoundValue(StringPiece compoundKey,
526                                const Locale &locale,
527                                const UNumberUnitWidth &width,
528                                UErrorCode &status) {
529     LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status));
530     if (U_FAILURE(status)) { return {}; }
531     CharString key;
532     key.append("units", status);
533     if (width == UNUM_UNIT_WIDTH_NARROW) {
534         key.append("Narrow", status);
535     } else if (width == UNUM_UNIT_WIDTH_SHORT) {
536         key.append("Short", status);
537     }
538     key.append("/compound/", status);
539     key.append(compoundKey, status);
540 
541     UErrorCode localStatus = status;
542     int32_t len = 0;
543     const UChar *ptr =
544         ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &len, &localStatus);
545     if (U_FAILURE(localStatus) && width != UNUM_UNIT_WIDTH_SHORT) {
546         // Fall back to short, which contains more compound data
547         key.clear();
548         key.append("unitsShort/compound/", status);
549         key.append(compoundKey, status);
550         ptr = ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &len, &status);
551     } else {
552         status = localStatus;
553     }
554     if (U_FAILURE(status)) {
555         return {};
556     }
557     return UnicodeString(ptr, len);
558 }
559 
560 /**
561  * Loads and applies deriveComponent rules from CLDR's grammaticalFeatures.xml.
562  *
563  * Consider a deriveComponent rule that looks like this:
564  *
565  *     <deriveComponent feature="case" structure="per" value0="compound" value1="nominative"/>
566  *
567  * Instantiating an instance as follows:
568  *
569  *     DerivedComponents d(loc, "case", "per");
570  *
571  * Applying the rule in the XML element above, `d.value0("foo")` will be "foo",
572  * and `d.value1("foo")` will be "nominative".
573  *
574  * The values returned by value0(...) and value1(...) are valid only while the
575  * instance exists. In case of any kind of failure, value0(...) and value1(...)
576  * will return "".
577  */
578 class DerivedComponents {
579   public:
580     /**
581      * Constructor.
582      *
583      * The feature and structure parameters must be null-terminated. The string
584      * referenced by compoundValue must exist for longer than the
585      * DerivedComponents instance.
586      */
DerivedComponents(const Locale & locale,const char * feature,const char * structure)587     DerivedComponents(const Locale &locale, const char *feature, const char *structure) {
588         StackUResourceBundle derivationsBundle, stackBundle;
589         ures_openDirectFillIn(derivationsBundle.getAlias(), NULL, "grammaticalFeatures", &status);
590         ures_getByKey(derivationsBundle.getAlias(), "grammaticalData", derivationsBundle.getAlias(),
591                       &status);
592         ures_getByKey(derivationsBundle.getAlias(), "derivations", derivationsBundle.getAlias(),
593                       &status);
594         if (U_FAILURE(status)) {
595             return;
596         }
597         UErrorCode localStatus = U_ZERO_ERROR;
598         // TODO(icu-units#28): use standard normal locale resolution algorithms
599         // rather than just grabbing language:
600         ures_getByKey(derivationsBundle.getAlias(), locale.getLanguage(), stackBundle.getAlias(),
601                       &localStatus);
602         // TODO(icu-units#28):
603         // - code currently assumes if the locale exists, the rules are there -
604         //   instead of falling back to root when the requested rule is missing.
605         // - investigate ures.h functions, see if one that uses res_findResource()
606         //   might be better (or use res_findResource directly), or maybe help
607         //   improve ures documentation to guide function selection?
608         if (localStatus == U_MISSING_RESOURCE_ERROR) {
609             ures_getByKey(derivationsBundle.getAlias(), "root", stackBundle.getAlias(), &status);
610         } else {
611             status = localStatus;
612         }
613         ures_getByKey(stackBundle.getAlias(), "component", stackBundle.getAlias(), &status);
614         ures_getByKey(stackBundle.getAlias(), feature, stackBundle.getAlias(), &status);
615         ures_getByKey(stackBundle.getAlias(), structure, stackBundle.getAlias(), &status);
616         UnicodeString val0 = ures_getUnicodeStringByIndex(stackBundle.getAlias(), 0, &status);
617         UnicodeString val1 = ures_getUnicodeStringByIndex(stackBundle.getAlias(), 1, &status);
618         if (U_SUCCESS(status)) {
619             if (val0.compare(UnicodeString(u"compound")) == 0) {
620                 compound0_ = true;
621             } else {
622                 compound0_ = false;
623                 value0_.appendInvariantChars(val0, status);
624             }
625             if (val1.compare(UnicodeString(u"compound")) == 0) {
626                 compound1_ = true;
627             } else {
628                 compound1_ = false;
629                 value1_.appendInvariantChars(val1, status);
630             }
631         }
632     }
633 
634     // Returns a StringPiece that is only valid as long as the instance exists.
value0(const StringPiece compoundValue) const635     StringPiece value0(const StringPiece compoundValue) const {
636         return compound0_ ? compoundValue : value0_.toStringPiece();
637     }
638 
639     // Returns a StringPiece that is only valid as long as the instance exists.
value1(const StringPiece compoundValue) const640     StringPiece value1(const StringPiece compoundValue) const {
641         return compound1_ ? compoundValue : value1_.toStringPiece();
642     }
643 
644     // Returns a char* that is only valid as long as the instance exists.
value0(const char * compoundValue) const645     const char *value0(const char *compoundValue) const {
646         return compound0_ ? compoundValue : value0_.data();
647     }
648 
649     // Returns a char* that is only valid as long as the instance exists.
value1(const char * compoundValue) const650     const char *value1(const char *compoundValue) const {
651         return compound1_ ? compoundValue : value1_.data();
652     }
653 
654   private:
655     UErrorCode status = U_ZERO_ERROR;
656 
657     // Holds strings referred to by value0 and value1;
658     bool compound0_ = false, compound1_ = false;
659     CharString value0_, value1_;
660 };
661 
662 // TODO(icu-units#28): test somehow? Associate with an ICU ticket for adding
663 // testsuite support for testing with synthetic data?
664 /**
665  * Loads and returns the value in rules that look like these:
666  *
667  * <deriveCompound feature="gender" structure="per" value="0"/>
668  * <deriveCompound feature="gender" structure="times" value="1"/>
669  *
670  * Currently a fake example, but spec compliant:
671  * <deriveCompound feature="gender" structure="power" value="feminine"/>
672  *
673  * NOTE: If U_FAILURE(status), returns an empty string.
674  */
675 UnicodeString
getDeriveCompoundRule(Locale locale,const char * feature,const char * structure,UErrorCode & status)676 getDeriveCompoundRule(Locale locale, const char *feature, const char *structure, UErrorCode &status) {
677     StackUResourceBundle derivationsBundle, stackBundle;
678     ures_openDirectFillIn(derivationsBundle.getAlias(), NULL, "grammaticalFeatures", &status);
679     ures_getByKey(derivationsBundle.getAlias(), "grammaticalData", derivationsBundle.getAlias(),
680                   &status);
681     ures_getByKey(derivationsBundle.getAlias(), "derivations", derivationsBundle.getAlias(), &status);
682     // TODO: use standard normal locale resolution algorithms rather than just grabbing language:
683     ures_getByKey(derivationsBundle.getAlias(), locale.getLanguage(), stackBundle.getAlias(), &status);
684     // TODO:
685     // - code currently assumes if the locale exists, the rules are there -
686     //   instead of falling back to root when the requested rule is missing.
687     // - investigate ures.h functions, see if one that uses res_findResource()
688     //   might be better (or use res_findResource directly), or maybe help
689     //   improve ures documentation to guide function selection?
690     if (status == U_MISSING_RESOURCE_ERROR) {
691         status = U_ZERO_ERROR;
692         ures_getByKey(derivationsBundle.getAlias(), "root", stackBundle.getAlias(), &status);
693     }
694     ures_getByKey(stackBundle.getAlias(), "compound", stackBundle.getAlias(), &status);
695     ures_getByKey(stackBundle.getAlias(), feature, stackBundle.getAlias(), &status);
696     UnicodeString uVal = ures_getUnicodeStringByKey(stackBundle.getAlias(), structure, &status);
697     if (U_FAILURE(status)) {
698         return {};
699     }
700     U_ASSERT(!uVal.isBogus());
701     return uVal;
702 }
703 
704 // Returns the gender string for structures following these rules:
705 //
706 // <deriveCompound feature="gender" structure="per" value="0"/>
707 // <deriveCompound feature="gender" structure="times" value="1"/>
708 //
709 // Fake example:
710 // <deriveCompound feature="gender" structure="power" value="feminine"/>
711 //
712 // data0 and data1 should be pattern arrays (UnicodeString[ARRAY_SIZE]) that
713 // correspond to value="0" and value="1".
714 //
715 // Pass a nullptr to data1 if the structure has no concept of value="1" (e.g.
716 // "prefix" doesn't).
getDerivedGender(Locale locale,const char * structure,UnicodeString * data0,UnicodeString * data1,UErrorCode & status)717 UnicodeString getDerivedGender(Locale locale,
718                                const char *structure,
719                                UnicodeString *data0,
720                                UnicodeString *data1,
721                                UErrorCode &status) {
722     UnicodeString val = getDeriveCompoundRule(locale, "gender", structure, status);
723     if (val.length() == 1) {
724         switch (val[0]) {
725         case u'0':
726             return data0[GENDER_INDEX];
727         case u'1':
728             if (data1 == nullptr) {
729                 return {};
730             }
731             return data1[GENDER_INDEX];
732         }
733     }
734     return val;
735 }
736 
737 ////////////////////////
738 /// END DATA LOADING ///
739 ////////////////////////
740 
741 // TODO: promote this somewhere? It's based on patternprops.cpp' trimWhitespace
trimSpaceChars(const UChar * s,int32_t & length)742 const UChar *trimSpaceChars(const UChar *s, int32_t &length) {
743     if (length <= 0 || (!u_isJavaSpaceChar(s[0]) && !u_isJavaSpaceChar(s[length - 1]))) {
744         return s;
745     }
746     int32_t start = 0;
747     int32_t limit = length;
748     while (start < limit && u_isJavaSpaceChar(s[start])) {
749         ++start;
750     }
751     if (start < limit) {
752         // There is non-white space at start; we will not move limit below that,
753         // so we need not test start<limit in the loop.
754         while (u_isJavaSpaceChar(s[limit - 1])) {
755             --limit;
756         }
757     }
758     length = limit - start;
759     return s + start;
760 }
761 
762 /**
763  * Calculates the gender of an arbitrary unit: this is the *second*
764  * implementation of an algorithm to do this:
765  *
766  * Gender is also calculated in "processPatternTimes": that code path is "bottom
767  * up", loading the gender for every component of a compound unit (at the same
768  * time as loading the Long Names formatting patterns), even if the gender is
769  * unneeded, then combining the single units' genders into the compound unit's
770  * gender, according to the rules. This algorithm does a lazier "top-down"
771  * evaluation, starting with the compound unit, calculating which single unit's
772  * gender is needed by breaking it down according to the rules, and then loading
773  * only the gender of the one single unit who's gender is needed.
774  *
775  * For future refactorings:
776  * 1. we could drop processPatternTimes' gender calculation and just call this
777  *    function: for UNUM_UNIT_WIDTH_FULL_NAME, the unit gender is in the very
778  *    same table as the formatting patterns, so loading it then may be
779  *    efficient. For other unit widths however, it needs to be explicitly looked
780  *    up anyway.
781  * 2. alternatively, if CLDR is providing all the genders we need such that we
782  *    don't need to calculate them in ICU anymore, we could drop this function
783  *    and keep only processPatternTimes' calculation. (And optimise it a bit?)
784  *
785  * @param locale The desired locale.
786  * @param unit The measure unit to calculate the gender for.
787  * @return The gender string for the unit, or an empty string if unknown or
788  *     ungendered.
789  */
calculateGenderForUnit(const Locale & locale,const MeasureUnit & unit,UErrorCode & status)790 UnicodeString calculateGenderForUnit(const Locale &locale, const MeasureUnit &unit, UErrorCode &status) {
791     MeasureUnitImpl impl;
792     const MeasureUnitImpl& mui = MeasureUnitImpl::forMeasureUnit(unit, impl, status);
793     int32_t singleUnitIndex = 0;
794     if (mui.complexity == UMEASURE_UNIT_COMPOUND) {
795         int32_t startSlice = 0;
796         // inclusive
797         int32_t endSlice = mui.singleUnits.length()-1;
798         U_ASSERT(endSlice > 0); // Else it would not be COMPOUND
799         if (mui.singleUnits[endSlice]->dimensionality < 0) {
800             // We have a -per- construct
801             UnicodeString perRule = getDeriveCompoundRule(locale, "gender", "per", status);
802             if (perRule.length() != 1) {
803                 // Fixed gender for -per- units
804                 return perRule;
805             }
806             if (perRule[0] == u'1') {
807                 // Find the start of the denominator. We already know there is one.
808                 while (mui.singleUnits[startSlice]->dimensionality >= 0) {
809                     startSlice++;
810                 }
811             } else {
812                 // Find the end of the numerator
813                 while (endSlice >= 0 && mui.singleUnits[endSlice]->dimensionality < 0) {
814                     endSlice--;
815                 }
816                 if (endSlice < 0) {
817                     // We have only a denominator, e.g. "per-second".
818                     // TODO(icu-units#28): find out what gender to use in the
819                     // absence of a first value - mentioned in CLDR-14253.
820                     return {};
821                 }
822             }
823         }
824         if (endSlice > startSlice) {
825             // We have a -times- construct
826             UnicodeString timesRule = getDeriveCompoundRule(locale, "gender", "times", status);
827             if (timesRule.length() != 1) {
828                 // Fixed gender for -times- units
829                 return timesRule;
830             }
831             if (timesRule[0] == u'0') {
832                 endSlice = startSlice;
833             } else {
834                 // We assume timesRule[0] == u'1'
835                 startSlice = endSlice;
836             }
837         }
838         U_ASSERT(startSlice == endSlice);
839         singleUnitIndex = startSlice;
840     } else if (mui.complexity == UMEASURE_UNIT_MIXED) {
841         status = U_INTERNAL_PROGRAM_ERROR;
842         return {};
843     } else {
844         U_ASSERT(mui.complexity == UMEASURE_UNIT_SINGLE);
845         U_ASSERT(mui.singleUnits.length() == 1);
846     }
847 
848     // Now we know which singleUnit's gender we want
849     const SingleUnitImpl *singleUnit = mui.singleUnits[singleUnitIndex];
850     // Check for any power-prefix gender override:
851     if (std::abs(singleUnit->dimensionality) != 1) {
852         UnicodeString powerRule = getDeriveCompoundRule(locale, "gender", "power", status);
853         if (powerRule.length() != 1) {
854             // Fixed gender for -powN- units
855             return powerRule;
856         }
857         // powerRule[0] == u'0'; u'1' not currently in spec.
858     }
859     // Check for any SI and binary prefix gender override:
860     if (std::abs(singleUnit->dimensionality) != 1) {
861         UnicodeString prefixRule = getDeriveCompoundRule(locale, "gender", "prefix", status);
862         if (prefixRule.length() != 1) {
863             // Fixed gender for -powN- units
864             return prefixRule;
865         }
866         // prefixRule[0] == u'0'; u'1' not currently in spec.
867     }
868     // Now we've boiled it down to the gender of one simple unit identifier:
869     return getGenderForBuiltin(locale, MeasureUnit::forIdentifier(singleUnit->getSimpleUnitID(), status),
870                                status);
871 }
872 
maybeCalculateGender(const Locale & locale,const MeasureUnit & unitRef,UnicodeString * outArray,UErrorCode & status)873 void maybeCalculateGender(const Locale &locale,
874                           const MeasureUnit &unitRef,
875                           UnicodeString *outArray,
876                           UErrorCode &status) {
877     if (outArray[GENDER_INDEX].isBogus()) {
878         UnicodeString meterGender = getGenderForBuiltin(locale, MeasureUnit::getMeter(), status);
879         if (meterGender.isEmpty()) {
880             // No gender for meter: assume ungendered language
881             return;
882         }
883         // We have a gendered language, but are lacking gender for unitRef.
884         outArray[GENDER_INDEX] = calculateGenderForUnit(locale, unitRef, status);
885     }
886 }
887 
888 } // namespace
889 
forMeasureUnit(const Locale & loc,const MeasureUnit & unitRef,const UNumberUnitWidth & width,const char * unitDisplayCase,const PluralRules * rules,const MicroPropsGenerator * parent,LongNameHandler * fillIn,UErrorCode & status)890 void LongNameHandler::forMeasureUnit(const Locale &loc,
891                                      const MeasureUnit &unitRef,
892                                      const UNumberUnitWidth &width,
893                                      const char *unitDisplayCase,
894                                      const PluralRules *rules,
895                                      const MicroPropsGenerator *parent,
896                                      LongNameHandler *fillIn,
897                                      UErrorCode &status) {
898     // From https://unicode.org/reports/tr35/tr35-general.html#compound-units -
899     // Points 1 and 2 are mostly handled by MeasureUnit:
900     //
901     // 1. If the unitId is empty or invalid, fail
902     // 2. Put the unitId into normalized order
903     U_ASSERT(fillIn != nullptr);
904 
905     if (uprv_strcmp(unitRef.getType(), "") != 0) {
906         // Handling built-in units:
907         //
908         // 3. Set result to be getValue(unitId with length, pluralCategory, caseVariant)
909         //    - If result is not empty, return it
910         UnicodeString simpleFormats[ARRAY_LENGTH];
911         getMeasureData(loc, unitRef, width, unitDisplayCase, simpleFormats, status);
912         maybeCalculateGender(loc, unitRef, simpleFormats, status);
913         if (U_FAILURE(status)) {
914             return;
915         }
916         fillIn->rules = rules;
917         fillIn->parent = parent;
918         fillIn->simpleFormatsToModifiers(simpleFormats,
919                                          {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status);
920         if (!simpleFormats[GENDER_INDEX].isBogus()) {
921             fillIn->gender = getGenderString(simpleFormats[GENDER_INDEX], status);
922         }
923         return;
924 
925         // TODO(icu-units#145): figure out why this causes a failure in
926         // format/MeasureFormatTest/TestIndividualPluralFallback and other
927         // tests, when it should have been an alternative for the lines above:
928 
929         // forArbitraryUnit(loc, unitRef, width, unitDisplayCase, fillIn, status);
930         // fillIn->rules = rules;
931         // fillIn->parent = parent;
932         // return;
933     } else {
934         // Check if it is a MeasureUnit this constructor handles: this
935         // constructor does not handle mixed units
936         U_ASSERT(unitRef.getComplexity(status) != UMEASURE_UNIT_MIXED);
937         forArbitraryUnit(loc, unitRef, width, unitDisplayCase, fillIn, status);
938         fillIn->rules = rules;
939         fillIn->parent = parent;
940         return;
941     }
942 }
943 
forArbitraryUnit(const Locale & loc,const MeasureUnit & unitRef,const UNumberUnitWidth & width,const char * unitDisplayCase,LongNameHandler * fillIn,UErrorCode & status)944 void LongNameHandler::forArbitraryUnit(const Locale &loc,
945                                        const MeasureUnit &unitRef,
946                                        const UNumberUnitWidth &width,
947                                        const char *unitDisplayCase,
948                                        LongNameHandler *fillIn,
949                                        UErrorCode &status) {
950     if (U_FAILURE(status)) {
951         return;
952     }
953     if (fillIn == nullptr) {
954         status = U_INTERNAL_PROGRAM_ERROR;
955         return;
956     }
957 
958     // Numbered list items are from the algorithms at
959     // https://unicode.org/reports/tr35/tr35-general.html#compound-units:
960     //
961     // 4. Divide the unitId into numerator (the part before the "-per-") and
962     //    denominator (the part after the "-per-). If both are empty, fail
963     MeasureUnitImpl unit;
964     MeasureUnitImpl perUnit;
965     {
966         MeasureUnitImpl fullUnit = MeasureUnitImpl::forMeasureUnitMaybeCopy(unitRef, status);
967         if (U_FAILURE(status)) {
968             return;
969         }
970         for (int32_t i = 0; i < fullUnit.singleUnits.length(); i++) {
971             SingleUnitImpl *subUnit = fullUnit.singleUnits[i];
972             if (subUnit->dimensionality > 0) {
973                 unit.appendSingleUnit(*subUnit, status);
974             } else {
975                 subUnit->dimensionality *= -1;
976                 perUnit.appendSingleUnit(*subUnit, status);
977             }
978         }
979     }
980 
981     // TODO(icu-units#28): check placeholder logic, see if it needs to be
982     // present here instead of only in processPatternTimes:
983     //
984     // 5. Set both globalPlaceholder and globalPlaceholderPosition to be empty
985 
986     DerivedComponents derivedPerCases(loc, "case", "per");
987 
988     // 6. numeratorUnitString
989     UnicodeString numeratorUnitData[ARRAY_LENGTH];
990     processPatternTimes(std::move(unit), loc, width, derivedPerCases.value0(unitDisplayCase),
991                         numeratorUnitData, status);
992 
993     // 7. denominatorUnitString
994     UnicodeString denominatorUnitData[ARRAY_LENGTH];
995     processPatternTimes(std::move(perUnit), loc, width, derivedPerCases.value1(unitDisplayCase),
996                         denominatorUnitData, status);
997 
998     // TODO(icu-units#139):
999     // - implement DerivedComponents for "plural/times" and "plural/power":
1000     //   French has different rules, we'll be producing the wrong results
1001     //   currently. (Prove via tests!)
1002     // - implement DerivedComponents for "plural/per", "plural/prefix",
1003     //   "case/times", "case/power", and "case/prefix" - although they're
1004     //   currently hardcoded. Languages with different rules are surely on the
1005     //   way.
1006     //
1007     // Currently we only use "case/per", "plural/times", "case/times", and
1008     // "case/power".
1009     //
1010     // This may have impact on multiSimpleFormatsToModifiers(...) below too?
1011     // These rules are currently (ICU 69) all the same and hard-coded below.
1012     UnicodeString perUnitPattern;
1013     if (!denominatorUnitData[PER_INDEX].isBogus()) {
1014         // If we have no denominator, we obtain the empty string:
1015         perUnitPattern = denominatorUnitData[PER_INDEX];
1016     } else {
1017         // 8. Set perPattern to be getValue([per], locale, length)
1018         UnicodeString rawPerUnitFormat = getCompoundValue("per", loc, width, status);
1019         // rawPerUnitFormat is something like "{0} per {1}"; we need to substitute in the secondary unit.
1020         SimpleFormatter perPatternFormatter(rawPerUnitFormat, 2, 2, status);
1021         if (U_FAILURE(status)) {
1022             return;
1023         }
1024         // Plural and placeholder handling for 7. denominatorUnitString:
1025         // TODO(icu-units#139): hardcoded:
1026         // <deriveComponent feature="plural" structure="per" value0="compound" value1="one"/>
1027         UnicodeString denominatorFormat =
1028             getWithPlural(denominatorUnitData, StandardPlural::Form::ONE, status);
1029         // Some "one" pattern may not contain "{0}". For example in "ar" or "ne" locale.
1030         SimpleFormatter denominatorFormatter(denominatorFormat, 0, 1, status);
1031         if (U_FAILURE(status)) {
1032             return;
1033         }
1034         UnicodeString denominatorPattern = denominatorFormatter.getTextWithNoArguments();
1035         int32_t trimmedLen = denominatorPattern.length();
1036         const UChar *trimmed = trimSpaceChars(denominatorPattern.getBuffer(), trimmedLen);
1037         UnicodeString denominatorString(false, trimmed, trimmedLen);
1038         // 9. If the denominatorString is empty, set result to
1039         //    [numeratorString], otherwise set result to format(perPattern,
1040         //    numeratorString, denominatorString)
1041         //
1042         // TODO(icu-units#28): Why does UnicodeString need to be explicit in the
1043         // following line?
1044         perPatternFormatter.format(UnicodeString(u"{0}"), denominatorString, perUnitPattern, status);
1045         if (U_FAILURE(status)) {
1046             return;
1047         }
1048     }
1049     if (perUnitPattern.length() == 0) {
1050         fillIn->simpleFormatsToModifiers(numeratorUnitData,
1051                                          {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status);
1052     } else {
1053         fillIn->multiSimpleFormatsToModifiers(numeratorUnitData, perUnitPattern,
1054                                               {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status);
1055     }
1056 
1057     // Gender
1058     //
1059     // TODO(icu-units#28): find out what gender to use in the absence of a first
1060     // value - e.g. what's the gender of "per-second"? Mentioned in CLDR-14253.
1061     //
1062     // gender/per deriveCompound rules don't say:
1063     // <deriveCompound feature="gender" structure="per" value="0"/> <!-- gender(gram-per-meter) ←  gender(gram) -->
1064     fillIn->gender = getGenderString(
1065         getDerivedGender(loc, "per", numeratorUnitData, denominatorUnitData, status), status);
1066 }
1067 
processPatternTimes(MeasureUnitImpl && productUnit,Locale loc,const UNumberUnitWidth & width,const char * caseVariant,UnicodeString * outArray,UErrorCode & status)1068 void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit,
1069                                           Locale loc,
1070                                           const UNumberUnitWidth &width,
1071                                           const char *caseVariant,
1072                                           UnicodeString *outArray,
1073                                           UErrorCode &status) {
1074     if (U_FAILURE(status)) {
1075         return;
1076     }
1077     if (productUnit.complexity == UMEASURE_UNIT_MIXED) {
1078         // These are handled by MixedUnitLongNameHandler
1079         status = U_UNSUPPORTED_ERROR;
1080         return;
1081     }
1082 
1083 #if U_DEBUG
1084     for (int32_t pluralIndex = 0; pluralIndex < ARRAY_LENGTH; pluralIndex++) {
1085         U_ASSERT(outArray[pluralIndex].length() == 0);
1086         U_ASSERT(!outArray[pluralIndex].isBogus());
1087     }
1088 #endif
1089 
1090     if (productUnit.identifier.isEmpty()) {
1091         // TODO(icu-units#28): consider when serialize should be called.
1092         // identifier might also be empty for MeasureUnit().
1093         productUnit.serialize(status);
1094     }
1095     if (U_FAILURE(status)) {
1096         return;
1097     }
1098     if (productUnit.identifier.length() == 0) {
1099         // MeasureUnit(): no units: return empty strings.
1100         return;
1101     }
1102 
1103     MeasureUnit builtinUnit;
1104     if (MeasureUnit::findBySubType(productUnit.identifier.toStringPiece(), &builtinUnit)) {
1105         // TODO(icu-units#145): spec doesn't cover builtin-per-builtin, it
1106         // breaks them all down. Do we want to drop this?
1107         // - findBySubType isn't super efficient, if we skip it and go to basic
1108         //   singles, we don't have to construct MeasureUnit's anymore.
1109         // - Check all the existing unit tests that fail without this: is it due
1110         //   to incorrect fallback via getMeasureData?
1111         // - Do those unit tests cover this code path representatively?
1112         if (builtinUnit != MeasureUnit()) {
1113             getMeasureData(loc, builtinUnit, width, caseVariant, outArray, status);
1114             maybeCalculateGender(loc, builtinUnit, outArray, status);
1115         }
1116         return;
1117     }
1118 
1119     // 2. Set timesPattern to be getValue(times, locale, length)
1120     UnicodeString timesPattern = getCompoundValue("times", loc, width, status);
1121     SimpleFormatter timesPatternFormatter(timesPattern, 2, 2, status);
1122     if (U_FAILURE(status)) {
1123         return;
1124     }
1125 
1126     PlaceholderPosition globalPlaceholder[ARRAY_LENGTH];
1127     UChar globalJoinerChar = 0;
1128     // Numbered list items are from the algorithms at
1129     // https://unicode.org/reports/tr35/tr35-general.html#compound-units:
1130     //
1131     // pattern(...) point 5:
1132     // - Set both globalPlaceholder and globalPlaceholderPosition to be empty
1133     //
1134     // 3. Set result to be empty
1135     for (int32_t pluralIndex = 0; pluralIndex < ARRAY_LENGTH; pluralIndex++) {
1136         // Initial state: empty string pattern, via all falling back to OTHER:
1137         if (pluralIndex == StandardPlural::Form::OTHER) {
1138             outArray[pluralIndex].remove();
1139         } else {
1140             outArray[pluralIndex].setToBogus();
1141         }
1142         globalPlaceholder[pluralIndex] = PH_EMPTY;
1143     }
1144 
1145     // Empty string represents "compound" (propagate the plural form).
1146     const char *pluralCategory = "";
1147     DerivedComponents derivedTimesPlurals(loc, "plural", "times");
1148     DerivedComponents derivedTimesCases(loc, "case", "times");
1149     DerivedComponents derivedPowerCases(loc, "case", "power");
1150 
1151     // 4. For each single_unit in product_unit
1152     for (int32_t singleUnitIndex = 0; singleUnitIndex < productUnit.singleUnits.length();
1153          singleUnitIndex++) {
1154         SingleUnitImpl *singleUnit = productUnit.singleUnits[singleUnitIndex];
1155         const char *singlePluralCategory;
1156         const char *singleCaseVariant;
1157         // TODO(icu-units#28): ensure we have unit tests that change/fail if we
1158         // assign incorrect case variants here:
1159         if (singleUnitIndex < productUnit.singleUnits.length() - 1) {
1160             // 4.1. If hasMultiple
1161             singlePluralCategory = derivedTimesPlurals.value0(pluralCategory);
1162             singleCaseVariant = derivedTimesCases.value0(caseVariant);
1163             pluralCategory = derivedTimesPlurals.value1(pluralCategory);
1164             caseVariant = derivedTimesCases.value1(caseVariant);
1165         } else {
1166             singlePluralCategory = derivedTimesPlurals.value1(pluralCategory);
1167             singleCaseVariant = derivedTimesCases.value1(caseVariant);
1168         }
1169 
1170         // 4.2. Get the gender of that single_unit
1171         MeasureUnit simpleUnit;
1172         if (!MeasureUnit::findBySubType(singleUnit->getSimpleUnitID(), &simpleUnit)) {
1173             // Ideally all simple units should be known, but they're not:
1174             // 100-kilometer is internally treated as a simple unit, but it is
1175             // not a built-in unit and does not have formatting data in CLDR 39.
1176             //
1177             // TODO(icu-units#28): test (desirable) invariants in unit tests.
1178             status = U_UNSUPPORTED_ERROR;
1179             return;
1180         }
1181         const char *gender = getGenderString(getGenderForBuiltin(loc, simpleUnit, status), status);
1182 
1183         // 4.3. If singleUnit starts with a dimensionality_prefix, such as 'square-'
1184         U_ASSERT(singleUnit->dimensionality > 0);
1185         int32_t dimensionality = singleUnit->dimensionality;
1186         UnicodeString dimensionalityPrefixPatterns[ARRAY_LENGTH];
1187         if (dimensionality != 1) {
1188             // 4.3.1. set dimensionalityPrefixPattern to be
1189             //   getValue(that dimensionality_prefix, locale, length, singlePluralCategory, singleCaseVariant, gender),
1190             //   such as "{0} kwadratowym"
1191             CharString dimensionalityKey("compound/power", status);
1192             dimensionalityKey.appendNumber(dimensionality, status);
1193             getInflectedMeasureData(dimensionalityKey.toStringPiece(), loc, width, gender,
1194                                     singleCaseVariant, dimensionalityPrefixPatterns, status);
1195             if (U_FAILURE(status)) {
1196                 // At the time of writing, only pow2 and pow3 are supported.
1197                 // Attempting to format other powers results in a
1198                 // U_RESOURCE_TYPE_MISMATCH. We convert the error if we
1199                 // understand it:
1200                 if (status == U_RESOURCE_TYPE_MISMATCH && dimensionality > 3) {
1201                     status = U_UNSUPPORTED_ERROR;
1202                 }
1203                 return;
1204             }
1205 
1206             // TODO(icu-units#139):
1207             // 4.3.2. set singlePluralCategory to be power0(singlePluralCategory)
1208 
1209             // 4.3.3. set singleCaseVariant to be power0(singleCaseVariant)
1210             singleCaseVariant = derivedPowerCases.value0(singleCaseVariant);
1211             // 4.3.4. remove the dimensionality_prefix from singleUnit
1212             singleUnit->dimensionality = 1;
1213         }
1214 
1215         // 4.4. if singleUnit starts with an si_prefix, such as 'centi'
1216         UMeasurePrefix prefix = singleUnit->unitPrefix;
1217         UnicodeString prefixPattern;
1218         if (prefix != UMEASURE_PREFIX_ONE) {
1219             // 4.4.1. set siPrefixPattern to be getValue(that si_prefix, locale,
1220             //        length), such as "centy{0}"
1221             CharString prefixKey;
1222             // prefixKey looks like "1024p3" or "10p-2":
1223             prefixKey.appendNumber(umeas_getPrefixBase(prefix), status);
1224             prefixKey.append('p', status);
1225             prefixKey.appendNumber(umeas_getPrefixPower(prefix), status);
1226             // Contains a pattern like "centy{0}".
1227             prefixPattern = getCompoundValue(prefixKey.toStringPiece(), loc, width, status);
1228 
1229             // 4.4.2. set singlePluralCategory to be prefix0(singlePluralCategory)
1230             //
1231             // TODO(icu-units#139): that refers to these rules:
1232             // <deriveComponent feature="plural" structure="prefix" value0="one" value1="compound"/>
1233             // though I'm not sure what other value they might end up having.
1234             //
1235             // 4.4.3. set singleCaseVariant to be prefix0(singleCaseVariant)
1236             //
1237             // TODO(icu-units#139): that refers to:
1238             // <deriveComponent feature="case" structure="prefix" value0="nominative"
1239             // value1="compound"/> but the prefix (value0) doesn't have case, the rest simply
1240             // propagates.
1241 
1242             // 4.4.4. remove the si_prefix from singleUnit
1243             singleUnit->unitPrefix = UMEASURE_PREFIX_ONE;
1244         }
1245 
1246         // 4.5. Set corePattern to be the getValue(singleUnit, locale, length,
1247         //      singlePluralCategory, singleCaseVariant), such as "{0} metrem"
1248         UnicodeString singleUnitArray[ARRAY_LENGTH];
1249         // At this point we are left with a Simple Unit:
1250         U_ASSERT(uprv_strcmp(singleUnit->build(status).getIdentifier(), singleUnit->getSimpleUnitID()) ==
1251                  0);
1252         getMeasureData(loc, singleUnit->build(status), width, singleCaseVariant, singleUnitArray,
1253                        status);
1254         if (U_FAILURE(status)) {
1255             // Shouldn't happen if we have data for all single units
1256             return;
1257         }
1258 
1259         // Calculate output gender
1260         if (!singleUnitArray[GENDER_INDEX].isBogus()) {
1261             U_ASSERT(!singleUnitArray[GENDER_INDEX].isEmpty());
1262             UnicodeString uVal;
1263 
1264             if (prefix != UMEASURE_PREFIX_ONE) {
1265                 singleUnitArray[GENDER_INDEX] =
1266                     getDerivedGender(loc, "prefix", singleUnitArray, nullptr, status);
1267             }
1268 
1269             if (dimensionality != 1) {
1270                 singleUnitArray[GENDER_INDEX] =
1271                     getDerivedGender(loc, "power", singleUnitArray, nullptr, status);
1272             }
1273 
1274             UnicodeString timesGenderRule = getDeriveCompoundRule(loc, "gender", "times", status);
1275             if (timesGenderRule.length() == 1) {
1276                 switch (timesGenderRule[0]) {
1277                 case u'0':
1278                     if (singleUnitIndex == 0) {
1279                         U_ASSERT(outArray[GENDER_INDEX].isBogus());
1280                         outArray[GENDER_INDEX] = singleUnitArray[GENDER_INDEX];
1281                     }
1282                     break;
1283                 case u'1':
1284                     if (singleUnitIndex == productUnit.singleUnits.length() - 1) {
1285                         U_ASSERT(outArray[GENDER_INDEX].isBogus());
1286                         outArray[GENDER_INDEX] = singleUnitArray[GENDER_INDEX];
1287                     }
1288                 }
1289             } else {
1290                 if (outArray[GENDER_INDEX].isBogus()) {
1291                     outArray[GENDER_INDEX] = timesGenderRule;
1292                 }
1293             }
1294         }
1295 
1296         // Calculate resulting patterns for each plural form
1297         for (int32_t pluralIndex = 0; pluralIndex < StandardPlural::Form::COUNT; pluralIndex++) {
1298             StandardPlural::Form plural = static_cast<StandardPlural::Form>(pluralIndex);
1299 
1300             // singleUnitArray[pluralIndex] looks something like "{0} Meter"
1301             if (outArray[pluralIndex].isBogus()) {
1302                 if (singleUnitArray[pluralIndex].isBogus()) {
1303                     // Let the usual plural fallback mechanism take care of this
1304                     // plural form
1305                     continue;
1306                 } else {
1307                     // Since our singleUnit can have a plural form that outArray
1308                     // doesn't yet have (relying on fallback to OTHER), we start
1309                     // by grabbing it with the normal plural fallback mechanism
1310                     outArray[pluralIndex] = getWithPlural(outArray, plural, status);
1311                     if (U_FAILURE(status)) {
1312                         return;
1313                     }
1314                 }
1315             }
1316 
1317             if (uprv_strcmp(singlePluralCategory, "") != 0) {
1318                 plural = static_cast<StandardPlural::Form>(getIndex(singlePluralCategory, status));
1319             }
1320 
1321             // 4.6. Extract(corePattern, coreUnit, placeholder, placeholderPosition) from that pattern.
1322             UnicodeString coreUnit;
1323             PlaceholderPosition placeholderPosition;
1324             UChar joinerChar;
1325             extractCorePattern(getWithPlural(singleUnitArray, plural, status), coreUnit,
1326                                placeholderPosition, joinerChar);
1327 
1328             // 4.7 If the position is middle, then fail
1329             if (placeholderPosition == PH_MIDDLE) {
1330                 status = U_UNSUPPORTED_ERROR;
1331                 return;
1332             }
1333 
1334             // 4.8. If globalPlaceholder is empty
1335             if (globalPlaceholder[pluralIndex] == PH_EMPTY) {
1336                 globalPlaceholder[pluralIndex] = placeholderPosition;
1337                 globalJoinerChar = joinerChar;
1338             } else {
1339                 // Expect all units involved to have the same placeholder position
1340                 U_ASSERT(globalPlaceholder[pluralIndex] == placeholderPosition);
1341                 // TODO(icu-units#28): Do we want to add a unit test that checks
1342                 // for consistent joiner chars? Probably not, given how
1343                 // inconsistent they are. File a CLDR ticket with examples?
1344             }
1345             // Now coreUnit would be just "Meter"
1346 
1347             // 4.9. If siPrefixPattern is not empty
1348             if (prefix != UMEASURE_PREFIX_ONE) {
1349                 SimpleFormatter prefixCompiled(prefixPattern, 1, 1, status);
1350                 if (U_FAILURE(status)) {
1351                     return;
1352                 }
1353 
1354                 // 4.9.1. Set coreUnit to be the combineLowercasing(locale, length, siPrefixPattern,
1355                 //        coreUnit)
1356                 UnicodeString tmp;
1357                 // combineLowercasing(locale, length, prefixPattern, coreUnit)
1358                 //
1359                 // TODO(icu-units#28): run this only if prefixPattern does not
1360                 // contain space characters - do languages "as", "bn", "hi",
1361                 // "kk", etc have concepts of upper and lower case?:
1362                 if (width == UNUM_UNIT_WIDTH_FULL_NAME) {
1363                     coreUnit.toLower(loc);
1364                 }
1365                 prefixCompiled.format(coreUnit, tmp, status);
1366                 if (U_FAILURE(status)) {
1367                     return;
1368                 }
1369                 coreUnit = tmp;
1370             }
1371 
1372             // 4.10. If dimensionalityPrefixPattern is not empty
1373             if (dimensionality != 1) {
1374                 SimpleFormatter dimensionalityCompiled(
1375                     getWithPlural(dimensionalityPrefixPatterns, plural, status), 1, 1, status);
1376                 if (U_FAILURE(status)) {
1377                     return;
1378                 }
1379 
1380                 // 4.10.1. Set coreUnit to be the combineLowercasing(locale, length,
1381                 //         dimensionalityPrefixPattern, coreUnit)
1382                 UnicodeString tmp;
1383                 // combineLowercasing(locale, length, prefixPattern, coreUnit)
1384                 //
1385                 // TODO(icu-units#28): run this only if prefixPattern does not
1386                 // contain space characters - do languages "as", "bn", "hi",
1387                 // "kk", etc have concepts of upper and lower case?:
1388                 if (width == UNUM_UNIT_WIDTH_FULL_NAME) {
1389                     coreUnit.toLower(loc);
1390                 }
1391                 dimensionalityCompiled.format(coreUnit, tmp, status);
1392                 if (U_FAILURE(status)) {
1393                     return;
1394                 }
1395                 coreUnit = tmp;
1396             }
1397 
1398             if (outArray[pluralIndex].length() == 0) {
1399                 // 4.11. If the result is empty, set result to be coreUnit
1400                 outArray[pluralIndex] = coreUnit;
1401             } else {
1402                 // 4.12. Otherwise set result to be format(timesPattern, result, coreUnit)
1403                 UnicodeString tmp;
1404                 timesPatternFormatter.format(outArray[pluralIndex], coreUnit, tmp, status);
1405                 outArray[pluralIndex] = tmp;
1406             }
1407         }
1408     }
1409     for (int32_t pluralIndex = 0; pluralIndex < StandardPlural::Form::COUNT; pluralIndex++) {
1410         if (globalPlaceholder[pluralIndex] == PH_BEGINNING) {
1411             UnicodeString tmp;
1412             tmp.append(u"{0}", 3);
1413             if (globalJoinerChar != 0) {
1414                 tmp.append(globalJoinerChar);
1415             }
1416             tmp.append(outArray[pluralIndex]);
1417             outArray[pluralIndex] = tmp;
1418         } else if (globalPlaceholder[pluralIndex] == PH_END) {
1419             if (globalJoinerChar != 0) {
1420                 outArray[pluralIndex].append(globalJoinerChar);
1421             }
1422             outArray[pluralIndex].append(u"{0}", 3);
1423         }
1424     }
1425 }
1426 
getUnitDisplayName(const Locale & loc,const MeasureUnit & unit,UNumberUnitWidth width,UErrorCode & status)1427 UnicodeString LongNameHandler::getUnitDisplayName(
1428         const Locale& loc,
1429         const MeasureUnit& unit,
1430         UNumberUnitWidth width,
1431         UErrorCode& status) {
1432     if (U_FAILURE(status)) {
1433         return ICU_Utility::makeBogusString();
1434     }
1435     UnicodeString simpleFormats[ARRAY_LENGTH];
1436     getMeasureData(loc, unit, width, "", simpleFormats, status);
1437     return simpleFormats[DNAM_INDEX];
1438 }
1439 
getUnitPattern(const Locale & loc,const MeasureUnit & unit,UNumberUnitWidth width,StandardPlural::Form pluralForm,UErrorCode & status)1440 UnicodeString LongNameHandler::getUnitPattern(
1441         const Locale& loc,
1442         const MeasureUnit& unit,
1443         UNumberUnitWidth width,
1444         StandardPlural::Form pluralForm,
1445         UErrorCode& status) {
1446     if (U_FAILURE(status)) {
1447         return ICU_Utility::makeBogusString();
1448     }
1449     UnicodeString simpleFormats[ARRAY_LENGTH];
1450     getMeasureData(loc, unit, width, "", simpleFormats, status);
1451     // The above already handles fallback from other widths to short
1452     if (U_FAILURE(status)) {
1453         return ICU_Utility::makeBogusString();
1454     }
1455     // Now handle fallback from other plural forms to OTHER
1456     return (!(simpleFormats[pluralForm]).isBogus())? simpleFormats[pluralForm]:
1457             simpleFormats[StandardPlural::Form::OTHER];
1458 }
1459 
forCurrencyLongNames(const Locale & loc,const CurrencyUnit & currency,const PluralRules * rules,const MicroPropsGenerator * parent,UErrorCode & status)1460 LongNameHandler* LongNameHandler::forCurrencyLongNames(const Locale &loc, const CurrencyUnit &currency,
1461                                                       const PluralRules *rules,
1462                                                       const MicroPropsGenerator *parent,
1463                                                       UErrorCode &status) {
1464     auto* result = new LongNameHandler(rules, parent);
1465     if (result == nullptr) {
1466         status = U_MEMORY_ALLOCATION_ERROR;
1467         return nullptr;
1468     }
1469     UnicodeString simpleFormats[ARRAY_LENGTH];
1470     getCurrencyLongNameData(loc, currency, simpleFormats, status);
1471     if (U_FAILURE(status)) { return nullptr; }
1472     result->simpleFormatsToModifiers(simpleFormats, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status);
1473     // TODO(icu-units#28): currency gender?
1474     return result;
1475 }
1476 
simpleFormatsToModifiers(const UnicodeString * simpleFormats,Field field,UErrorCode & status)1477 void LongNameHandler::simpleFormatsToModifiers(const UnicodeString *simpleFormats, Field field,
1478                                                UErrorCode &status) {
1479     for (int32_t i = 0; i < StandardPlural::Form::COUNT; i++) {
1480         StandardPlural::Form plural = static_cast<StandardPlural::Form>(i);
1481         UnicodeString simpleFormat = getWithPlural(simpleFormats, plural, status);
1482         if (U_FAILURE(status)) { return; }
1483         SimpleFormatter compiledFormatter(simpleFormat, 0, 1, status);
1484         if (U_FAILURE(status)) { return; }
1485         fModifiers[i] = SimpleModifier(compiledFormatter, field, false, {this, SIGNUM_POS_ZERO, plural});
1486     }
1487 }
1488 
multiSimpleFormatsToModifiers(const UnicodeString * leadFormats,UnicodeString trailFormat,Field field,UErrorCode & status)1489 void LongNameHandler::multiSimpleFormatsToModifiers(const UnicodeString *leadFormats, UnicodeString trailFormat,
1490                                                     Field field, UErrorCode &status) {
1491     SimpleFormatter trailCompiled(trailFormat, 1, 1, status);
1492     if (U_FAILURE(status)) { return; }
1493     for (int32_t i = 0; i < StandardPlural::Form::COUNT; i++) {
1494         StandardPlural::Form plural = static_cast<StandardPlural::Form>(i);
1495         UnicodeString leadFormat = getWithPlural(leadFormats, plural, status);
1496         if (U_FAILURE(status)) { return; }
1497         UnicodeString compoundFormat;
1498         if (leadFormat.length() == 0) {
1499             compoundFormat = trailFormat;
1500         } else {
1501             trailCompiled.format(leadFormat, compoundFormat, status);
1502             if (U_FAILURE(status)) { return; }
1503         }
1504         SimpleFormatter compoundCompiled(compoundFormat, 0, 1, status);
1505         if (U_FAILURE(status)) { return; }
1506         fModifiers[i] = SimpleModifier(compoundCompiled, field, false, {this, SIGNUM_POS_ZERO, plural});
1507     }
1508 }
1509 
processQuantity(DecimalQuantity & quantity,MicroProps & micros,UErrorCode & status) const1510 void LongNameHandler::processQuantity(DecimalQuantity &quantity, MicroProps &micros,
1511                                       UErrorCode &status) const {
1512     if (parent != NULL) {
1513         parent->processQuantity(quantity, micros, status);
1514     }
1515     StandardPlural::Form pluralForm = utils::getPluralSafe(micros.rounder, rules, quantity, status);
1516     micros.modOuter = &fModifiers[pluralForm];
1517     micros.gender = gender;
1518 }
1519 
getModifier(Signum,StandardPlural::Form plural) const1520 const Modifier* LongNameHandler::getModifier(Signum /*signum*/, StandardPlural::Form plural) const {
1521     return &fModifiers[plural];
1522 }
1523 
forMeasureUnit(const Locale & loc,const MeasureUnit & mixedUnit,const UNumberUnitWidth & width,const char * unitDisplayCase,const PluralRules * rules,const MicroPropsGenerator * parent,MixedUnitLongNameHandler * fillIn,UErrorCode & status)1524 void MixedUnitLongNameHandler::forMeasureUnit(const Locale &loc,
1525                                               const MeasureUnit &mixedUnit,
1526                                               const UNumberUnitWidth &width,
1527                                               const char *unitDisplayCase,
1528                                               const PluralRules *rules,
1529                                               const MicroPropsGenerator *parent,
1530                                               MixedUnitLongNameHandler *fillIn,
1531                                               UErrorCode &status) {
1532     U_ASSERT(mixedUnit.getComplexity(status) == UMEASURE_UNIT_MIXED);
1533     U_ASSERT(fillIn != nullptr);
1534     if (U_FAILURE(status)) {
1535         return;
1536     }
1537 
1538     MeasureUnitImpl temp;
1539     const MeasureUnitImpl &impl = MeasureUnitImpl::forMeasureUnit(mixedUnit, temp, status);
1540     // Defensive, for production code:
1541     if (impl.complexity != UMEASURE_UNIT_MIXED) {
1542         // Should be using the normal LongNameHandler
1543         status = U_UNSUPPORTED_ERROR;
1544         return;
1545     }
1546 
1547     fillIn->fMixedUnitCount = impl.singleUnits.length();
1548     fillIn->fMixedUnitData.adoptInstead(new UnicodeString[fillIn->fMixedUnitCount * ARRAY_LENGTH]);
1549     for (int32_t i = 0; i < fillIn->fMixedUnitCount; i++) {
1550         // Grab data for each of the components.
1551         UnicodeString *unitData = &fillIn->fMixedUnitData[i * ARRAY_LENGTH];
1552         // TODO(CLDR-14502): check from the CLDR-14502 ticket whether this
1553         // propagation of unitDisplayCase is correct:
1554         getMeasureData(loc, impl.singleUnits[i]->build(status), width, unitDisplayCase, unitData,
1555                        status);
1556         // TODO(ICU-21494): if we add support for gender for mixed units, we may
1557         // need maybeCalculateGender() here.
1558     }
1559 
1560     // TODO(icu-units#120): Make sure ICU doesn't output zero-valued
1561     // high-magnitude fields
1562     // * for mixed units count N, produce N listFormatters, one for each subset
1563     //   that might be formatted.
1564     UListFormatterWidth listWidth = ULISTFMT_WIDTH_SHORT;
1565     if (width == UNUM_UNIT_WIDTH_NARROW) {
1566         listWidth = ULISTFMT_WIDTH_NARROW;
1567     } else if (width == UNUM_UNIT_WIDTH_FULL_NAME) {
1568         // This might be the same as SHORT in most languages:
1569         listWidth = ULISTFMT_WIDTH_WIDE;
1570     }
1571     fillIn->fListFormatter.adoptInsteadAndCheckErrorCode(
1572         ListFormatter::createInstance(loc, ULISTFMT_TYPE_UNITS, listWidth, status), status);
1573     // TODO(ICU-21494): grab gender of each unit, calculate the gender
1574     // associated with this list formatter, save it for later.
1575     fillIn->rules = rules;
1576     fillIn->parent = parent;
1577 
1578     // We need a localised NumberFormatter for the numbers of the bigger units
1579     // (providing Arabic numerals, for example).
1580     fillIn->fNumberFormatter = NumberFormatter::withLocale(loc);
1581 }
1582 
processQuantity(DecimalQuantity & quantity,MicroProps & micros,UErrorCode & status) const1583 void MixedUnitLongNameHandler::processQuantity(DecimalQuantity &quantity, MicroProps &micros,
1584                                                UErrorCode &status) const {
1585     U_ASSERT(fMixedUnitCount > 1);
1586     if (parent != nullptr) {
1587         parent->processQuantity(quantity, micros, status);
1588     }
1589     micros.modOuter = getMixedUnitModifier(quantity, micros, status);
1590 }
1591 
getMixedUnitModifier(DecimalQuantity & quantity,MicroProps & micros,UErrorCode & status) const1592 const Modifier *MixedUnitLongNameHandler::getMixedUnitModifier(DecimalQuantity &quantity,
1593                                                                MicroProps &micros,
1594                                                                UErrorCode &status) const {
1595     if (micros.mixedMeasuresCount == 0) {
1596         U_ASSERT(micros.mixedMeasuresCount > 0); // Mixed unit: we must have more than one unit value
1597         status = U_UNSUPPORTED_ERROR;
1598         return &micros.helpers.emptyWeakModifier;
1599     }
1600 
1601     // Algorithm:
1602     //
1603     // For the mixed-units measurement of: "3 yard, 1 foot, 2.6 inch", we should
1604     // find "3 yard" and "1 foot" in micros.mixedMeasures.
1605     //
1606     // Obtain long-names with plural forms corresponding to measure values:
1607     //   * {0} yards, {0} foot, {0} inches
1608     //
1609     // Format the integer values appropriately and modify with the format
1610     // strings:
1611     //   - 3 yards, 1 foot
1612     //
1613     // Use ListFormatter to combine, with one placeholder:
1614     //   - 3 yards, 1 foot and {0} inches
1615     //
1616     // Return a SimpleModifier for this pattern, letting the rest of the
1617     // pipeline take care of the remaining inches.
1618 
1619     LocalArray<UnicodeString> outputMeasuresList(new UnicodeString[fMixedUnitCount], status);
1620     if (U_FAILURE(status)) {
1621         return &micros.helpers.emptyWeakModifier;
1622     }
1623 
1624     StandardPlural::Form quantityPlural = StandardPlural::Form::OTHER;
1625     for (int32_t i = 0; i < micros.mixedMeasuresCount; i++) {
1626         DecimalQuantity fdec;
1627 
1628         // If numbers are negative, only the first number needs to have its
1629         // negative sign formatted.
1630         int64_t number = i > 0 ? std::abs(micros.mixedMeasures[i]) : micros.mixedMeasures[i];
1631 
1632         if (micros.indexOfQuantity == i) { // Insert placeholder for `quantity`
1633             // If quantity is not the first value and quantity is negative
1634             if (micros.indexOfQuantity > 0 && quantity.isNegative()) {
1635                 quantity.negate();
1636             }
1637 
1638             StandardPlural::Form quantityPlural =
1639                 utils::getPluralSafe(micros.rounder, rules, quantity, status);
1640             UnicodeString quantityFormatWithPlural =
1641                 getWithPlural(&fMixedUnitData[i * ARRAY_LENGTH], quantityPlural, status);
1642             SimpleFormatter quantityFormatter(quantityFormatWithPlural, 0, 1, status);
1643             quantityFormatter.format(UnicodeString(u"{0}"), outputMeasuresList[i], status);
1644         } else {
1645             fdec.setToLong(number);
1646             StandardPlural::Form pluralForm = utils::getStandardPlural(rules, fdec);
1647             UnicodeString simpleFormat =
1648                 getWithPlural(&fMixedUnitData[i * ARRAY_LENGTH], pluralForm, status);
1649             SimpleFormatter compiledFormatter(simpleFormat, 0, 1, status);
1650             UnicodeString num;
1651             auto appendable = UnicodeStringAppendable(num);
1652 
1653             fNumberFormatter.formatDecimalQuantity(fdec, status).appendTo(appendable, status);
1654             compiledFormatter.format(num, outputMeasuresList[i], status);
1655         }
1656     }
1657 
1658     // TODO(ICU-21494): implement gender for lists of mixed units. Presumably we
1659     // can set micros.gender to the gender associated with the list formatter in
1660     // use below (once we have correct support for that). And then document this
1661     // appropriately? "getMixedUnitModifier" doesn't sound like it would do
1662     // something like this.
1663 
1664     // Combine list into a "premixed" pattern
1665     UnicodeString premixedFormatPattern;
1666     fListFormatter->format(outputMeasuresList.getAlias(), fMixedUnitCount, premixedFormatPattern,
1667                            status);
1668     SimpleFormatter premixedCompiled(premixedFormatPattern, 0, 1, status);
1669     if (U_FAILURE(status)) {
1670         return &micros.helpers.emptyWeakModifier;
1671     }
1672 
1673     micros.helpers.mixedUnitModifier =
1674         SimpleModifier(premixedCompiled, kUndefinedField, false, {this, SIGNUM_POS_ZERO, quantityPlural});
1675     return &micros.helpers.mixedUnitModifier;
1676 }
1677 
getModifier(Signum,StandardPlural::Form) const1678 const Modifier *MixedUnitLongNameHandler::getModifier(Signum /*signum*/,
1679                                                       StandardPlural::Form /*plural*/) const {
1680     // TODO(icu-units#28): investigate this method when investigating where
1681     // ModifierStore::getModifier() gets used. To be sure it remains
1682     // unreachable:
1683     UPRV_UNREACHABLE_EXIT;
1684     return nullptr;
1685 }
1686 
forMeasureUnits(const Locale & loc,const MaybeStackVector<MeasureUnit> & units,const UNumberUnitWidth & width,const char * unitDisplayCase,const PluralRules * rules,const MicroPropsGenerator * parent,UErrorCode & status)1687 LongNameMultiplexer *LongNameMultiplexer::forMeasureUnits(const Locale &loc,
1688                                                           const MaybeStackVector<MeasureUnit> &units,
1689                                                           const UNumberUnitWidth &width,
1690                                                           const char *unitDisplayCase,
1691                                                           const PluralRules *rules,
1692                                                           const MicroPropsGenerator *parent,
1693                                                           UErrorCode &status) {
1694     LocalPointer<LongNameMultiplexer> result(new LongNameMultiplexer(parent), status);
1695     if (U_FAILURE(status)) {
1696         return nullptr;
1697     }
1698     U_ASSERT(units.length() > 0);
1699     if (result->fHandlers.resize(units.length()) == nullptr) {
1700         status = U_MEMORY_ALLOCATION_ERROR;
1701         return nullptr;
1702     }
1703     result->fMeasureUnits.adoptInstead(new MeasureUnit[units.length()]);
1704     for (int32_t i = 0, length = units.length(); i < length; i++) {
1705         const MeasureUnit &unit = *units[i];
1706         result->fMeasureUnits[i] = unit;
1707         if (unit.getComplexity(status) == UMEASURE_UNIT_MIXED) {
1708             MixedUnitLongNameHandler *mlnh = result->fMixedUnitHandlers.createAndCheckErrorCode(status);
1709             MixedUnitLongNameHandler::forMeasureUnit(loc, unit, width, unitDisplayCase, rules, NULL,
1710                                                      mlnh, status);
1711             result->fHandlers[i] = mlnh;
1712         } else {
1713             LongNameHandler *lnh = result->fLongNameHandlers.createAndCheckErrorCode(status);
1714             LongNameHandler::forMeasureUnit(loc, unit, width, unitDisplayCase, rules, NULL, lnh, status);
1715             result->fHandlers[i] = lnh;
1716         }
1717         if (U_FAILURE(status)) {
1718             return nullptr;
1719         }
1720     }
1721     return result.orphan();
1722 }
1723 
processQuantity(DecimalQuantity & quantity,MicroProps & micros,UErrorCode & status) const1724 void LongNameMultiplexer::processQuantity(DecimalQuantity &quantity, MicroProps &micros,
1725                                           UErrorCode &status) const {
1726     // We call parent->processQuantity() from the Multiplexer, instead of
1727     // letting LongNameHandler handle it: we don't know which LongNameHandler to
1728     // call until we've called the parent!
1729     fParent->processQuantity(quantity, micros, status);
1730 
1731     // Call the correct LongNameHandler based on outputUnit
1732     for (int i = 0; i < fHandlers.getCapacity(); i++) {
1733         if (fMeasureUnits[i] == micros.outputUnit) {
1734             fHandlers[i]->processQuantity(quantity, micros, status);
1735             return;
1736         }
1737     }
1738     if (U_FAILURE(status)) {
1739         return;
1740     }
1741     // We shouldn't receive any outputUnit for which we haven't already got a
1742     // LongNameHandler:
1743     status = U_INTERNAL_PROGRAM_ERROR;
1744 }
1745 
1746 #endif /* #if !UCONFIG_NO_FORMATTING */
1747