1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 /* Structured representation of Unicode locale IDs used with Intl functions. */
8
9 #ifndef builtin_intl_LanguageTag_h
10 #define builtin_intl_LanguageTag_h
11
12 #include "mozilla/Assertions.h"
13 #include "mozilla/Span.h"
14 #include "mozilla/TextUtils.h"
15 #include "mozilla/TypedEnumBits.h"
16 #include "mozilla/Variant.h"
17
18 #include <algorithm>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <string.h>
22 #include <utility>
23
24 #include "js/AllocPolicy.h"
25 #include "js/GCAPI.h"
26 #include "js/Result.h"
27 #include "js/RootingAPI.h"
28 #include "js/Utility.h"
29 #include "js/Vector.h"
30
31 struct JS_PUBLIC_API JSContext;
32 class JSLinearString;
33 class JS_PUBLIC_API JSString;
34 class JS_PUBLIC_API JSTracer;
35
36 namespace js {
37
38 namespace intl {
39
40 /**
41 * Return true if |language| is a valid language subtag.
42 */
43 template <typename CharT>
44 bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language);
45
46 /**
47 * Return true if |script| is a valid script subtag.
48 */
49 template <typename CharT>
50 bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script);
51
52 /**
53 * Return true if |region| is a valid region subtag.
54 */
55 template <typename CharT>
56 bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region);
57
58 #ifdef DEBUG
59 /**
60 * Return true if |variant| is a valid variant subtag.
61 */
62 bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant);
63
64 /**
65 * Return true if |extension| is a valid Unicode extension subtag.
66 */
67 bool IsStructurallyValidUnicodeExtensionTag(
68 mozilla::Span<const char> extension);
69
70 /**
71 * Return true if |privateUse| is a valid private-use subtag.
72 */
73 bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse);
74
75 #endif
76
77 template <typename CharT>
AsciiToLowerCase(CharT c)78 char AsciiToLowerCase(CharT c) {
79 MOZ_ASSERT(mozilla::IsAscii(c));
80 return mozilla::IsAsciiUppercaseAlpha(c) ? (c + 0x20) : c;
81 }
82
83 template <typename CharT>
AsciiToUpperCase(CharT c)84 char AsciiToUpperCase(CharT c) {
85 MOZ_ASSERT(mozilla::IsAscii(c));
86 return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c;
87 }
88
89 template <typename CharT>
AsciiToLowerCase(CharT * chars,size_t length,char * dest)90 void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
91 // Tell the analysis the |std::transform| function can't GC.
92 JS::AutoSuppressGCAnalysis nogc;
93
94 char (&fn)(CharT) = AsciiToLowerCase;
95 std::transform(chars, chars + length, dest, fn);
96 }
97
98 template <typename CharT>
AsciiToUpperCase(CharT * chars,size_t length,char * dest)99 void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
100 // Tell the analysis the |std::transform| function can't GC.
101 JS::AutoSuppressGCAnalysis nogc;
102
103 char (&fn)(CharT) = AsciiToUpperCase;
104 std::transform(chars, chars + length, dest, fn);
105 }
106
/**
 * Title-case |length| ASCII code units starting at |chars| into |dest|: the
 * first character is upper-cased, all remaining characters are lower-cased.
 * Nothing is read or written when |length| is zero.
 */
template <typename CharT>
void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
  if (length == 0) {
    return;
  }

  AsciiToUpperCase(chars, 1, dest);
  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
}
114
// Maximum lengths, in characters, of the fixed-size parts of a language tag.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
static constexpr size_t LanguageLength = 8;

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength = 4;

// unicode_region_subtag = (alpha{2} | digit{3}) ;
static constexpr size_t AlphaRegionLength = 2;
static constexpr size_t DigitRegionLength = 3;

// A region subtag has to hold the longer of the two region forms.
static constexpr size_t RegionLength = AlphaRegionLength > DigitRegionLength
                                           ? AlphaRegionLength
                                           : DigitRegionLength;

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength = 2;

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength = 2;

}  // namespace LanguageTagLimits
136
137 // Fixed size language subtag which is stored inline in LanguageTag.
138 template <size_t Length>
139 class LanguageTagSubtag final {
140 uint8_t length_ = 0;
141 char chars_[Length] = {}; // zero initialize
142
143 public:
144 LanguageTagSubtag() = default;
145
146 LanguageTagSubtag(const LanguageTagSubtag&) = delete;
147 LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;
148
length()149 size_t length() const { return length_; }
missing()150 bool missing() const { return length_ == 0; }
present()151 bool present() const { return length_ > 0; }
152
span()153 mozilla::Span<const char> span() const { return {chars_, length_}; }
154
155 template <typename CharT>
set(mozilla::Span<const CharT> str)156 void set(mozilla::Span<const CharT> str) {
157 MOZ_ASSERT(str.size() <= Length);
158 std::copy_n(str.data(), str.size(), chars_);
159 length_ = str.size();
160 }
161
162 // The toXYZCase() methods are using |Length| instead of |length()|, because
163 // current compilers (tested GCC and Clang) can't infer the maximum string
164 // length - even when using hints like |std::min| - and instead are emitting
165 // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD
166 // code. (Emitting SIMD code doesn't make sense here, because the SIMD code
167 // only kicks in for long strings.) A fixed length will additionally ensure
168 // the compiler unrolls the loop in the case conversion code.
169
toLowerCase()170 void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); }
171
toUpperCase()172 void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); }
173
toTitleCase()174 void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); }
175
176 template <size_t N>
equalTo(const char (& str)[N])177 bool equalTo(const char (&str)[N]) const {
178 static_assert(N - 1 <= Length,
179 "subtag literals must not exceed the maximum subtag length");
180
181 return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0;
182 }
183 };
184
185 using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
186 using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
187 using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
188
189 /**
190 * Object representing a language tag.
191 *
192 * All subtags are already in canonicalized case.
193 */
194 class MOZ_STACK_CLASS LanguageTag final {
195 LanguageSubtag language_ = {};
196 ScriptSubtag script_ = {};
197 RegionSubtag region_ = {};
198
199 using VariantsVector = Vector<JS::UniqueChars, 2>;
200 using ExtensionsVector = Vector<JS::UniqueChars, 2>;
201
202 VariantsVector variants_;
203 ExtensionsVector extensions_;
204 JS::UniqueChars privateuse_ = nullptr;
205
206 friend class LanguageTagParser;
207
208 bool canonicalizeUnicodeExtension(JSContext* cx,
209 JS::UniqueChars& unicodeExtension);
210
211 bool canonicalizeTransformExtension(JSContext* cx,
212 JS::UniqueChars& transformExtension);
213
214 public:
215 static bool languageMapping(LanguageSubtag& language);
216 static bool complexLanguageMapping(const LanguageSubtag& language);
217
218 private:
219 static bool regionMapping(RegionSubtag& region);
220 static bool complexRegionMapping(const RegionSubtag& region);
221
222 void performComplexLanguageMappings();
223 void performComplexRegionMappings();
224 MOZ_MUST_USE bool performVariantMappings(JSContext* cx);
225
226 MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx);
227
228 static const char* replaceTransformExtensionType(
229 mozilla::Span<const char> key, mozilla::Span<const char> type);
230
231 public:
232 /**
233 * Given a Unicode key and type, return the null-terminated preferred
234 * replacement for that type if there is one, or null if there is none, e.g.
235 * in effect
236 * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
237 * and
238 * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
239 */
240 static const char* replaceUnicodeExtensionType(
241 mozilla::Span<const char> key, mozilla::Span<const char> type);
242
243 public:
LanguageTag(JSContext * cx)244 explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}
245
246 LanguageTag(const LanguageTag&) = delete;
247 LanguageTag& operator=(const LanguageTag&) = delete;
248
language()249 const LanguageSubtag& language() const { return language_; }
script()250 const ScriptSubtag& script() const { return script_; }
region()251 const RegionSubtag& region() const { return region_; }
variants()252 const auto& variants() const { return variants_; }
extensions()253 const auto& extensions() const { return extensions_; }
privateuse()254 const char* privateuse() const { return privateuse_.get(); }
255
256 /**
257 * Return the Unicode extension subtag or nullptr if not present.
258 */
259 const char* unicodeExtension() const;
260
261 private:
262 ptrdiff_t unicodeExtensionIndex() const;
263
264 public:
265 /**
266 * Set the language subtag. The input must be a valid language subtag.
267 */
268 template <size_t N>
setLanguage(const char (& language)[N])269 void setLanguage(const char (&language)[N]) {
270 mozilla::Span<const char> span(language, N - 1);
271 MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
272 language_.set(span);
273 }
274
275 /**
276 * Set the language subtag. The input must be a valid language subtag.
277 */
setLanguage(const LanguageSubtag & language)278 void setLanguage(const LanguageSubtag& language) {
279 MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span()));
280 language_.set(language.span());
281 }
282
283 /**
284 * Set the script subtag. The input must be a valid script subtag.
285 */
286 template <size_t N>
setScript(const char (& script)[N])287 void setScript(const char (&script)[N]) {
288 mozilla::Span<const char> span(script, N - 1);
289 MOZ_ASSERT(IsStructurallyValidScriptTag(span));
290 script_.set(span);
291 }
292
293 /**
294 * Set the script subtag. The input must be a valid script subtag or the empty
295 * string.
296 */
setScript(const ScriptSubtag & script)297 void setScript(const ScriptSubtag& script) {
298 MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span()));
299 script_.set(script.span());
300 }
301
302 /**
303 * Set the region subtag. The input must be a valid region subtag.
304 */
305 template <size_t N>
setRegion(const char (& region)[N])306 void setRegion(const char (®ion)[N]) {
307 mozilla::Span<const char> span(region, N - 1);
308 MOZ_ASSERT(IsStructurallyValidRegionTag(span));
309 region_.set(span);
310 }
311
312 /**
313 * Set the region subtag. The input must be a valid region subtag or the empty
314 * empty string.
315 */
setRegion(const RegionSubtag & region)316 void setRegion(const RegionSubtag& region) {
317 MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span()));
318 region_.set(region.span());
319 }
320
321 /**
322 * Removes all variant subtags.
323 */
clearVariants()324 void clearVariants() { variants_.clearAndFree(); }
325
326 /**
327 * Set the Unicode extension subtag. The input must be a valid Unicode
328 * extension subtag.
329 */
330 bool setUnicodeExtension(JS::UniqueChars extension);
331
332 /**
333 * Remove any Unicode extension subtag if present.
334 */
335 void clearUnicodeExtension();
336
337 /**
338 * Set the private-use subtag. The input must be a valid private-use subtag
339 * or nullptr.
340 */
setPrivateuse(JS::UniqueChars privateuse)341 void setPrivateuse(JS::UniqueChars privateuse) {
342 MOZ_ASSERT(!privateuse ||
343 IsStructurallyValidPrivateUseTag(
344 {privateuse.get(), strlen(privateuse.get())}));
345 privateuse_ = std::move(privateuse);
346 }
347
348 private:
349 enum class DuplicateVariants { Reject, Accept };
350
351 bool canonicalizeBaseName(JSContext* cx, DuplicateVariants duplicateVariants);
352
353 public:
354 /**
355 * Canonicalize the base-name subtags, that means the language, script,
356 * region, and variant subtags.
357 */
canonicalizeBaseName(JSContext * cx)358 bool canonicalizeBaseName(JSContext* cx) {
359 return canonicalizeBaseName(cx, DuplicateVariants::Reject);
360 }
361
362 /**
363 * Canonicalize all extension subtags.
364 */
365 bool canonicalizeExtensions(JSContext* cx);
366
367 /**
368 * Canonicalizes the given structurally valid Unicode BCP 47 locale
369 * identifier, including regularized case of subtags. For example, the
370 * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
371 * where
372 *
373 * Zh ; 2*3ALPHA
374 * -haNS ; ["-" script]
375 * -bu ; ["-" region]
376 * -variant2 ; *("-" variant)
377 * -Variant1
378 * -u-ca-chinese ; *("-" extension)
379 * -t-Zh-laTN
380 * -x-PRIVATE ; ["-" privateuse]
381 *
382 * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
383 *
384 * Spec: ECMAScript Internationalization API Specification, 6.2.3.
385 */
canonicalize(JSContext * cx)386 bool canonicalize(JSContext* cx) {
387 return canonicalizeBaseName(cx) && canonicalizeExtensions(cx);
388 }
389
390 /**
391 * Return the string representation of this language tag.
392 */
393 JSString* toString(JSContext* cx) const;
394
395 /**
396 * Return the string representation of this language tag as a null-terminated
397 * C-string.
398 */
399 JS::UniqueChars toStringZ(JSContext* cx) const;
400
401 /**
402 * Add likely-subtags to the language tag.
403 *
404 * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
405 */
406 bool addLikelySubtags(JSContext* cx);
407
408 /**
409 * Remove likely-subtags from the language tag.
410 *
411 * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
412 */
413 bool removeLikelySubtags(JSContext* cx);
414 };
415
416 /**
417 * Parser for Unicode BCP 47 locale identifiers.
418 *
419 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
420 */
421 class MOZ_STACK_CLASS LanguageTagParser final {
422 public:
423 // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
424 enum class TokenKind : uint8_t {
425 None = 0b000,
426 Alpha = 0b001,
427 Digit = 0b010,
428 AlphaDigit = 0b011,
429 Error = 0b100
430 };
431
432 private:
433 class Token final {
434 size_t index_;
435 size_t length_;
436 TokenKind kind_;
437
438 public:
Token(TokenKind kind,size_t index,size_t length)439 Token(TokenKind kind, size_t index, size_t length)
440 : index_(index), length_(length), kind_(kind) {}
441
kind()442 TokenKind kind() const { return kind_; }
index()443 size_t index() const { return index_; }
length()444 size_t length() const { return length_; }
445
isError()446 bool isError() const { return kind_ == TokenKind::Error; }
isNone()447 bool isNone() const { return kind_ == TokenKind::None; }
isAlpha()448 bool isAlpha() const { return kind_ == TokenKind::Alpha; }
isDigit()449 bool isDigit() const { return kind_ == TokenKind::Digit; }
isAlphaDigit()450 bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
451 };
452
453 using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;
454
455 const LocaleChars& locale_;
456 size_t length_;
457 size_t index_ = 0;
458
LanguageTagParser(const LocaleChars & locale,size_t length)459 LanguageTagParser(const LocaleChars& locale, size_t length)
460 : locale_(locale), length_(length) {}
461
charAtUnchecked(size_t index)462 char16_t charAtUnchecked(size_t index) const {
463 if (locale_.is<const JS::Latin1Char*>()) {
464 return locale_.as<const JS::Latin1Char*>()[index];
465 }
466 return locale_.as<const char16_t*>()[index];
467 }
468
charAt(size_t index)469 char charAt(size_t index) const {
470 char16_t c = charAtUnchecked(index);
471 MOZ_ASSERT(mozilla::IsAscii(c));
472 return c;
473 }
474
475 // Copy the token characters into |subtag|.
476 template <size_t N>
copyChars(const Token & tok,LanguageTagSubtag<N> & subtag)477 void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
478 size_t index = tok.index();
479 size_t length = tok.length();
480 if (locale_.is<const JS::Latin1Char*>()) {
481 using T = const JS::Latin1Char;
482 subtag.set(mozilla::MakeSpan(locale_.as<T*>() + index, length));
483 } else {
484 using T = const char16_t;
485 subtag.set(mozilla::MakeSpan(locale_.as<T*>() + index, length));
486 }
487 }
488
489 // Create a string copy of |length| characters starting at |index|.
490 JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;
491
492 // Create a string copy of the token characters.
chars(JSContext * cx,const Token & tok)493 JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
494 return chars(cx, tok.index(), tok.length());
495 }
496
extension(JSContext * cx,const Token & start,const Token & end)497 JS::UniqueChars extension(JSContext* cx, const Token& start,
498 const Token& end) const {
499 MOZ_ASSERT(start.index() < end.index());
500
501 size_t length = end.index() - 1 - start.index();
502 return chars(cx, start.index(), length);
503 }
504
505 Token nextToken();
506
507 // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
508 //
509 // Four character language subtags are not allowed in Unicode BCP 47 locale
510 // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
511 // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
isLanguage(const Token & tok)512 bool isLanguage(const Token& tok) const {
513 return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
514 (5 <= tok.length() && tok.length() <= 8));
515 }
516
517 // unicode_script_subtag = alpha{4} ;
isScript(const Token & tok)518 bool isScript(const Token& tok) const {
519 return tok.isAlpha() && tok.length() == 4;
520 }
521
522 // unicode_region_subtag = (alpha{2} | digit{3}) ;
isRegion(const Token & tok)523 bool isRegion(const Token& tok) const {
524 return (tok.isAlpha() && tok.length() == 2) ||
525 (tok.isDigit() && tok.length() == 3);
526 }
527
528 // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
isVariant(const Token & tok)529 bool isVariant(const Token& tok) const {
530 return (5 <= tok.length() && tok.length() <= 8) ||
531 (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
532 }
533
534 // Returns the code unit of the first character at the given singleton token.
535 // Always returns the lower case form of an alphabetical character.
singletonKey(const Token & tok)536 char singletonKey(const Token& tok) const {
537 MOZ_ASSERT(tok.length() == 1);
538 return AsciiToLowerCase(charAt(tok.index()));
539 }
540
541 // extensions = unicode_locale_extensions |
542 // transformed_extensions |
543 // other_extensions ;
544 //
545 // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
546 // (sep attribute)+ (sep keyword)*) ;
547 //
548 // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
549 // (sep tfield)+) ;
550 //
551 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isExtensionStart(const Token & tok)552 bool isExtensionStart(const Token& tok) const {
553 return tok.length() == 1 && singletonKey(tok) != 'x';
554 }
555
556 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isOtherExtensionPart(const Token & tok)557 bool isOtherExtensionPart(const Token& tok) const {
558 return 2 <= tok.length() && tok.length() <= 8;
559 }
560
561 // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
562 // (sep attribute)+ (sep keyword)*) ;
563 // keyword = key (sep type)? ;
isUnicodeExtensionPart(const Token & tok)564 bool isUnicodeExtensionPart(const Token& tok) const {
565 return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
566 isUnicodeExtensionAttribute(tok);
567 }
568
569 // attribute = alphanum{3,8} ;
isUnicodeExtensionAttribute(const Token & tok)570 bool isUnicodeExtensionAttribute(const Token& tok) const {
571 return 3 <= tok.length() && tok.length() <= 8;
572 }
573
574 // key = alphanum alpha ;
isUnicodeExtensionKey(const Token & tok)575 bool isUnicodeExtensionKey(const Token& tok) const {
576 return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
577 }
578
579 // type = alphanum{3,8} (sep alphanum{3,8})* ;
isUnicodeExtensionType(const Token & tok)580 bool isUnicodeExtensionType(const Token& tok) const {
581 return 3 <= tok.length() && tok.length() <= 8;
582 }
583
584 // tkey = alpha digit ;
isTransformExtensionKey(const Token & tok)585 bool isTransformExtensionKey(const Token& tok) const {
586 return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
587 mozilla::IsAsciiDigit(charAt(tok.index() + 1));
588 }
589
590 // tvalue = (sep alphanum{3,8})+ ;
isTransformExtensionPart(const Token & tok)591 bool isTransformExtensionPart(const Token& tok) const {
592 return 3 <= tok.length() && tok.length() <= 8;
593 }
594
595 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUseStart(const Token & tok)596 bool isPrivateUseStart(const Token& tok) const {
597 return tok.length() == 1 && singletonKey(tok) == 'x';
598 }
599
600 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUsePart(const Token & tok)601 bool isPrivateUsePart(const Token& tok) const {
602 return 1 <= tok.length() && tok.length() <= 8;
603 }
604
605 // Helper function for use in |parseBaseName| and
606 // |parseTlangInTransformExtension|. Do not use this directly!
607 static JS::Result<bool> internalParseBaseName(JSContext* cx,
608 LanguageTagParser& ts,
609 LanguageTag& tag, Token& tok);
610
611 // Parse the `unicode_language_id` production, i.e. the
612 // language/script/region/variants portion of a language tag, into |tag|.
613 // |tok| must be the current token.
parseBaseName(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)614 static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
615 LanguageTag& tag, Token& tok) {
616 return internalParseBaseName(cx, ts, tag, tok);
617 }
618
619 // Parse the `tlang` production within a parsed 't' transform extension.
620 // The precise requirements for "previously parsed" are:
621 //
622 // * the input begins from current token |tok| with a valid `tlang`
623 // * the `tlang` is wholly lowercase (*not* canonical case)
624 // * variant subtags in the `tlang` may contain duplicates and be
625 // unordered
626 //
627 // Return an error on internal failure. Otherwise, return a success value. If
628 // there was no `tlang`, then |tag.language().missing()|. But if there was a
629 // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
630 // parse input.
parseTlangInTransformExtension(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)631 static JS::Result<JS::Ok> parseTlangInTransformExtension(
632 JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
633 MOZ_ASSERT(ts.isLanguage(tok));
634 return internalParseBaseName(cx, ts, tag, tok).map([](bool parsed) {
635 MOZ_ASSERT(parsed);
636 return JS::Ok();
637 });
638 }
639
640 friend class LanguageTag;
641
642 class Range final {
643 size_t begin_;
644 size_t length_;
645
646 public:
Range(size_t begin,size_t length)647 Range(size_t begin, size_t length) : begin_(begin), length_(length) {}
648
649 template <typename T>
begin(T * ptr)650 T* begin(T* ptr) const {
651 return ptr + begin_;
652 }
653
length()654 size_t length() const { return length_; }
655 };
656
657 using TFieldVector = js::Vector<Range, 8>;
658 using AttributesVector = js::Vector<Range, 8>;
659 using KeywordsVector = js::Vector<Range, 8>;
660
661 // Parse |extension|, which must be a validated, fully lowercase
662 // `transformed_extensions` subtag, and fill |tag| and |fields| from the
663 // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
664 // with |extension|.
665 static JS::Result<bool> parseTransformExtension(
666 JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag,
667 TFieldVector& fields);
668
669 // Parse |extension|, which must be a validated, fully lowercase
670 // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
671 // from the `attribute` and `keyword` components.
672 static JS::Result<bool> parseUnicodeExtension(
673 JSContext* cx, mozilla::Span<const char> extension,
674 AttributesVector& attributes, KeywordsVector& keywords);
675
676 static JS::Result<bool> tryParse(JSContext* cx, LocaleChars& localeChars,
677 size_t localeLength, LanguageTag& tag);
678
679 public:
680 // Parse the input string as a language tag. Reports an error to the context
681 // if the input can't be parsed completely.
682 static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
683
684 // Parse the input string as a language tag. Reports an error to the context
685 // if the input can't be parsed completely.
686 static bool parse(JSContext* cx, mozilla::Span<const char> locale,
687 LanguageTag& tag);
688
689 // Parse the input string as a language tag. Returns Ok(true) if the input
690 // could be completely parsed, Ok(false) if the input couldn't be parsed,
691 // or Err() in case of internal error.
692 static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
693 LanguageTag& tag);
694
695 // Parse the input string as a language tag. Returns Ok(true) if the input
696 // could be completely parsed, Ok(false) if the input couldn't be parsed,
697 // or Err() in case of internal error.
698 static JS::Result<bool> tryParse(JSContext* cx,
699 mozilla::Span<const char> locale,
700 LanguageTag& tag);
701
702 // Parse the input string as the base-name parts (language, script, region,
703 // variants) of a language tag. Ignores any trailing characters.
704 static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale,
705 LanguageTag& tag);
706
707 // Parse the input string as the base-name parts (language, script, region,
708 // variants) of a language tag. Returns Ok(true) if the input could be
709 // completely parsed, Ok(false) if the input couldn't be parsed, or Err() in
710 // case of internal error.
711 static JS::Result<bool> tryParseBaseName(JSContext* cx,
712 JSLinearString* locale,
713 LanguageTag& tag);
714
715 // Return true iff |extension| can be parsed as a Unicode extension subtag.
716 static bool canParseUnicodeExtension(mozilla::Span<const char> extension);
717
718 // Return true iff |unicodeType| can be parsed as a Unicode extension type.
719 static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
720 };
721
722 MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
723
724 /**
725 * Parse a string as a standalone |language| tag. If |str| is a standalone
726 * language tag, store it in |result| and return true. Otherwise return false.
727 */
728 MOZ_MUST_USE bool ParseStandaloneLanguageTag(JS::Handle<JSLinearString*> str,
729 LanguageSubtag& result);
730
731 /**
732 * Parse a string as a standalone |script| tag. If |str| is a standalone script
733 * tag, store it in |result| and return true. Otherwise return false.
734 */
735 MOZ_MUST_USE bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str,
736 ScriptSubtag& result);
737
738 /**
739 * Parse a string as a standalone |region| tag. If |str| is a standalone region
740 * tag, store it in |result| and return true. Otherwise return false.
741 */
742 MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str,
743 RegionSubtag& result);
744
745 /**
746 * Parse a string as an ISO-639 language code. Return |nullptr| in the result if
747 * the input could not be parsed or the canonical form of the resulting language
748 * tag contains more than a single language subtag.
749 */
750 JS::Result<JSString*> ParseStandaloneISO639LanguageTag(
751 JSContext* cx, JS::Handle<JSLinearString*> str);
752
753 class UnicodeExtensionKeyword final {
754 char key_[LanguageTagLimits::UnicodeKeyLength];
755 JSLinearString* type_;
756
757 public:
758 using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1];
759 using UnicodeKeySpan =
760 mozilla::Span<const char, LanguageTagLimits::UnicodeKeyLength>;
761
UnicodeExtensionKeyword(UnicodeKey key,JSLinearString * type)762 UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type)
763 : key_{key[0], key[1]}, type_(type) {}
764
key()765 UnicodeKeySpan key() const { return {key_, sizeof(key_)}; }
type()766 JSLinearString* type() const { return type_; }
767
768 void trace(JSTracer* trc);
769 };
770
771 extern MOZ_MUST_USE bool ApplyUnicodeExtensionToTag(
772 JSContext* cx, LanguageTag& tag,
773 JS::HandleVector<UnicodeExtensionKeyword> keywords);
774
775 } // namespace intl
776
777 } // namespace js
778
779 #endif /* builtin_intl_LanguageTag_h */
780