1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef builtin_intl_SharedIntlData_h 8 #define builtin_intl_SharedIntlData_h 9 10 #include "mozilla/MemoryReporting.h" 11 #include "mozilla/UniquePtr.h" 12 13 #include <stddef.h> 14 15 #include "js/AllocPolicy.h" 16 #include "js/CharacterEncoding.h" 17 #include "js/GCAPI.h" 18 #include "js/GCHashTable.h" 19 #include "js/Result.h" 20 #include "js/RootingAPI.h" 21 #include "js/Utility.h" 22 #include "vm/StringType.h" 23 24 namespace mozilla::intl { 25 class DateTimePatternGenerator; 26 } // namespace mozilla::intl 27 28 namespace js { 29 30 class ArrayObject; 31 32 namespace intl { 33 34 /** 35 * This deleter class exists so that mozilla::intl::DateTimePatternGenerator 36 * can be a forward declaration, but still be used inside of a UniquePtr. 37 */ 38 class DateTimePatternGeneratorDeleter { 39 public: 40 void operator()(mozilla::intl::DateTimePatternGenerator* ptr); 41 }; 42 43 /** 44 * Stores Intl data which can be shared across compartments (but not contexts). 45 * 46 * Used for data which is expensive when computed repeatedly or is not 47 * available through ICU. 48 */ 49 class SharedIntlData { 50 struct LinearStringLookup { 51 union { 52 const JS::Latin1Char* latin1Chars; 53 const char16_t* twoByteChars; 54 }; 55 bool isLatin1; 56 size_t length; 57 JS::AutoCheckCannotGC nogc; 58 HashNumber hash = 0; 59 LinearStringLookupLinearStringLookup60 explicit LinearStringLookup(JSLinearString* string) 61 : isLatin1(string->hasLatin1Chars()), length(string->length()) { 62 if (isLatin1) { 63 latin1Chars = string->latin1Chars(nogc); 64 } else { 65 twoByteChars = string->twoByteChars(nogc); 66 } 67 } 68 LinearStringLookupLinearStringLookup69 LinearStringLookup(const char* chars, size_t length) 70 : isLatin1(true), length(length) { 71 latin1Chars = reinterpret_cast<const JS::Latin1Char*>(chars); 72 } 73 }; 74 75 public: 76 /** 77 * Information tracking the set of the supported time zone names, derived 78 * from the IANA time zone database <https://www.iana.org/time-zones>. 79 * 80 * There are two kinds of IANA time zone names: Zone and Link (denoted as 81 * such in database source files). Zone names are the canonical, preferred 82 * name for a time zone, e.g. Asia/Kolkata. Link names simply refer to 83 * target Zone names for their meaning, e.g. Asia/Calcutta targets 84 * Asia/Kolkata. That a name is a Link doesn't *necessarily* reflect a 85 * sense of deprecation: some Link names also exist partly for convenience, 86 * e.g. UTC and GMT as Link names targeting the Zone name Etc/UTC. 87 * 88 * Two data sources determine the time zone names we support: those ICU 89 * supports and IANA's zone information. 90 * 91 * Unfortunately the names ICU and IANA support, and their Link 92 * relationships from name to target, aren't identical, so we can't simply 93 * implicitly trust ICU's name handling. We must perform various 94 * preprocessing of user-provided zone names and post-processing of 95 * ICU-provided zone names to implement ECMA-402's IANA-consistent behavior. 96 * 97 * Also see <https://ssl.icu-project.org/trac/ticket/12044> and 98 * <http://unicode.org/cldr/trac/ticket/9892>. 99 */ 100 101 using TimeZoneName = JSAtom*; 102 103 struct TimeZoneHasher { 104 struct Lookup : LinearStringLookup { 105 explicit Lookup(JSLinearString* timeZone); 106 }; 107 hashTimeZoneHasher108 static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; } 109 static bool match(TimeZoneName key, const Lookup& lookup); 110 }; 111 112 using TimeZoneSet = 113 GCHashSet<TimeZoneName, TimeZoneHasher, SystemAllocPolicy>; 114 using TimeZoneMap = 115 GCHashMap<TimeZoneName, TimeZoneName, TimeZoneHasher, SystemAllocPolicy>; 116 117 private: 118 /** 119 * As a threshold matter, available time zones are those time zones ICU 120 * supports, via ucal_openTimeZones. But ICU supports additional non-IANA 121 * time zones described in intl/icu/source/tools/tzcode/icuzones (listed in 122 * IntlTimeZoneData.cpp's |legacyICUTimeZones|) for its own backwards 123 * compatibility purposes. This set consists of ICU's supported time zones, 124 * minus all backwards-compatibility time zones. 125 */ 126 TimeZoneSet availableTimeZones; 127 128 /** 129 * IANA treats some time zone names as Zones, that ICU instead treats as 130 * Links. For example, IANA considers "America/Indiana/Indianapolis" to be 131 * a Zone and "America/Fort_Wayne" a Link that targets it, but ICU 132 * considers the former a Link that targets "America/Indianapolis" (which 133 * IANA treats as a Link). 134 * 135 * ECMA-402 requires that we respect IANA data, so if we're asked to 136 * canonicalize a time zone name in this set, we must *not* return ICU's 137 * canonicalization. 138 */ 139 TimeZoneSet ianaZonesTreatedAsLinksByICU; 140 141 /** 142 * IANA treats some time zone names as Links to one target, that ICU 143 * instead treats as either Zones, or Links to different targets. An 144 * example of the former is "Asia/Calcutta, which IANA assigns the target 145 * "Asia/Kolkata" but ICU considers its own Zone. An example of the latter 146 * is "America/Virgin", which IANA assigns the target 147 * "America/Port_of_Spain" but ICU assigns the target "America/St_Thomas". 148 * 149 * ECMA-402 requires that we respect IANA data, so if we're asked to 150 * canonicalize a time zone name that's a key in this map, we *must* return 151 * the corresponding value and *must not* return ICU's canonicalization. 152 */ 153 TimeZoneMap ianaLinksCanonicalizedDifferentlyByICU; 154 155 bool timeZoneDataInitialized = false; 156 157 /** 158 * Precomputes the available time zone names, because it's too expensive to 159 * call ucal_openTimeZones() repeatedly. 160 */ 161 bool ensureTimeZones(JSContext* cx); 162 163 public: 164 /** 165 * Returns the validated time zone name in |result|. If the input time zone 166 * isn't a valid IANA time zone name, |result| remains unchanged. 167 */ 168 bool validateTimeZoneName(JSContext* cx, JS::Handle<JSString*> timeZone, 169 JS::MutableHandle<JSAtom*> result); 170 171 /** 172 * Returns the canonical time zone name in |result|. If no canonical name 173 * was found, |result| remains unchanged. 174 * 175 * This method only handles time zones which are canonicalized differently 176 * by ICU when compared to IANA. 177 */ 178 bool tryCanonicalizeTimeZoneConsistentWithIANA( 179 JSContext* cx, JS::Handle<JSString*> timeZone, 180 JS::MutableHandle<JSAtom*> result); 181 182 /** 183 * Returns an iterator over all available time zones supported by ICU. The 184 * returned time zone names aren't canonicalized. 185 */ 186 JS::Result<TimeZoneSet::Iterator> availableTimeZonesIteration(JSContext* cx); 187 188 private: 189 using Locale = JSAtom*; 190 191 struct LocaleHasher { 192 struct Lookup : LinearStringLookup { 193 explicit Lookup(JSLinearString* locale); 194 Lookup(const char* chars, size_t length); 195 }; 196 hashLocaleHasher197 static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; } 198 static bool match(Locale key, const Lookup& lookup); 199 }; 200 201 using LocaleSet = GCHashSet<Locale, LocaleHasher, SystemAllocPolicy>; 202 203 // Set of supported locales for all Intl service constructors except Collator, 204 // which uses its own set. 205 // 206 // UDateFormat: 207 // udat_[count,get]Available() return the same results as their 208 // uloc_[count,get]Available() counterparts. 209 // 210 // UNumberFormatter: 211 // unum_[count,get]Available() return the same results as their 212 // uloc_[count,get]Available() counterparts. 213 // 214 // UListFormatter, UPluralRules, and URelativeDateTimeFormatter: 215 // We're going to use ULocale availableLocales as per ICU recommendation: 216 // https://unicode-org.atlassian.net/browse/ICU-12756 217 LocaleSet supportedLocales; 218 219 // ucol_[count,get]Available() return different results compared to 220 // uloc_[count,get]Available(), we can't use |supportedLocales| here. 221 LocaleSet collatorSupportedLocales; 222 223 bool supportedLocalesInitialized = false; 224 225 // CountAvailable and GetAvailable describe the signatures used for ICU API 226 // to determine available locales for various functionality. 227 using CountAvailable = int32_t (*)(); 228 using GetAvailable = const char* (*)(int32_t localeIndex); 229 230 template <class AvailableLocales> 231 static bool getAvailableLocales(JSContext* cx, LocaleSet& locales, 232 const AvailableLocales& availableLocales); 233 234 /** 235 * Precomputes the available locales sets. 236 */ 237 bool ensureSupportedLocales(JSContext* cx); 238 239 public: 240 enum class SupportedLocaleKind { 241 Collator, 242 DateTimeFormat, 243 DisplayNames, 244 ListFormat, 245 NumberFormat, 246 PluralRules, 247 RelativeTimeFormat 248 }; 249 250 /** 251 * Sets |supported| to true if |locale| is supported by the requested Intl 252 * service constructor. Otherwise sets |supported| to false. 253 */ 254 [[nodiscard]] bool isSupportedLocale(JSContext* cx, SupportedLocaleKind kind, 255 JS::Handle<JSString*> locale, 256 bool* supported); 257 258 /** 259 * Returns all available locales for |kind|. 260 */ 261 ArrayObject* availableLocalesOf(JSContext* cx, SupportedLocaleKind kind); 262 263 private: 264 /** 265 * The case first parameter (BCP47 key "kf") allows to switch the order of 266 * upper- and lower-case characters. ICU doesn't directly provide an API 267 * to query the default case first value of a given locale, but instead 268 * requires to instantiate a collator object and then query the case first 269 * attribute (UCOL_CASE_FIRST). 270 * To avoid instantiating an additional collator object whenever we need 271 * to retrieve the default case first value of a specific locale, we 272 * compute the default case first value for every supported locale only 273 * once and then keep a list of all locales which don't use the default 274 * case first setting. 275 * There is almost no difference between lower-case first and when case 276 * first is disabled (UCOL_LOWER_FIRST resp. UCOL_OFF), so we only need to 277 * track locales which use upper-case first as their default setting. 278 * 279 * Instantiating collator objects for each available locale is slow 280 * (bug 1527879), therefore we're hardcoding the two locales using upper-case 281 * first ("da" (Danish) and "mt" (Maltese)) and only assert in debug-mode 282 * these two locales match the upper-case first locales returned by ICU. A 283 * system-ICU may support a different set of locales, therefore we're always 284 * calling into ICU to find the upper-case first locales in that case. 285 */ 286 287 #if DEBUG || MOZ_SYSTEM_ICU 288 LocaleSet upperCaseFirstLocales; 289 290 bool upperCaseFirstInitialized = false; 291 292 /** 293 * Precomputes the available locales which use upper-case first sorting. 294 */ 295 bool ensureUpperCaseFirstLocales(JSContext* cx); 296 #endif 297 298 public: 299 /** 300 * Sets |isUpperFirst| to true if |locale| sorts upper-case characters 301 * before lower-case characters. 302 */ 303 bool isUpperCaseFirst(JSContext* cx, JS::Handle<JSString*> locale, 304 bool* isUpperFirst); 305 306 private: 307 using UniqueDateTimePatternGenerator = 308 mozilla::UniquePtr<mozilla::intl::DateTimePatternGenerator, 309 DateTimePatternGeneratorDeleter>; 310 311 UniqueDateTimePatternGenerator dateTimePatternGenerator; 312 JS::UniqueChars dateTimePatternGeneratorLocale; 313 314 public: 315 /** 316 * Get a non-owned cached instance of the DateTimePatternGenerator, which is 317 * expensive to instantiate. 318 * 319 * See: https://bugzilla.mozilla.org/show_bug.cgi?id=1549578 320 */ 321 mozilla::intl::DateTimePatternGenerator* getDateTimePatternGenerator( 322 JSContext* cx, const char* locale); 323 324 public: 325 void destroyInstance(); 326 327 void trace(JSTracer* trc); 328 329 size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const; 330 }; 331 332 } // namespace intl 333 334 } // namespace js 335 336 #endif /* builtin_intl_SharedIntlData_h */ 337