1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /* Structured representation of Unicode locale IDs used with Intl functions. */
8 
9 #ifndef builtin_intl_LanguageTag_h
10 #define builtin_intl_LanguageTag_h
11 
12 #include "mozilla/Assertions.h"
13 #include "mozilla/Span.h"
14 #include "mozilla/TextUtils.h"
15 #include "mozilla/TypedEnumBits.h"
16 #include "mozilla/Variant.h"
17 
18 #include <algorithm>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <string.h>
22 #include <utility>
23 
24 #include "js/AllocPolicy.h"
25 #include "js/GCAPI.h"
26 #include "js/Result.h"
27 #include "js/RootingAPI.h"
28 #include "js/Utility.h"
29 #include "js/Vector.h"
30 
31 struct JS_PUBLIC_API JSContext;
32 class JSLinearString;
33 class JS_PUBLIC_API JSString;
34 class JS_PUBLIC_API JSTracer;
35 
36 namespace js {
37 
38 namespace intl {
39 
40 /**
41  * Return true if |language| is a valid language subtag.
42  */
43 template <typename CharT>
44 bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language);
45 
46 /**
47  * Return true if |script| is a valid script subtag.
48  */
49 template <typename CharT>
50 bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script);
51 
52 /**
53  * Return true if |region| is a valid region subtag.
54  */
55 template <typename CharT>
56 bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region);
57 
58 #ifdef DEBUG
59 /**
60  * Return true if |variant| is a valid variant subtag.
61  */
62 bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant);
63 
64 /**
65  * Return true if |extension| is a valid Unicode extension subtag.
66  */
67 bool IsStructurallyValidUnicodeExtensionTag(
68     mozilla::Span<const char> extension);
69 
70 /**
71  * Return true if |privateUse| is a valid private-use subtag.
72  */
73 bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse);
74 
75 #endif
76 
77 template <typename CharT>
AsciiToLowerCase(CharT c)78 char AsciiToLowerCase(CharT c) {
79   MOZ_ASSERT(mozilla::IsAscii(c));
80   return mozilla::IsAsciiUppercaseAlpha(c) ? (c + 0x20) : c;
81 }
82 
83 template <typename CharT>
AsciiToUpperCase(CharT c)84 char AsciiToUpperCase(CharT c) {
85   MOZ_ASSERT(mozilla::IsAscii(c));
86   return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c;
87 }
88 
89 template <typename CharT>
AsciiToLowerCase(CharT * chars,size_t length,char * dest)90 void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
91   // Tell the analysis the |std::transform| function can't GC.
92   JS::AutoSuppressGCAnalysis nogc;
93 
94   char (&fn)(CharT) = AsciiToLowerCase;
95   std::transform(chars, chars + length, dest, fn);
96 }
97 
98 template <typename CharT>
AsciiToUpperCase(CharT * chars,size_t length,char * dest)99 void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
100   // Tell the analysis the |std::transform| function can't GC.
101   JS::AutoSuppressGCAnalysis nogc;
102 
103   char (&fn)(CharT) = AsciiToUpperCase;
104   std::transform(chars, chars + length, dest, fn);
105 }
106 
/**
 * Write the title case form of the |length| ASCII code units starting at
 * |chars| into |dest|: the first code unit is upper-cased and all remaining
 * ones are lower-cased. Does nothing when |length| is zero.
 */
template <typename CharT>
void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
  if (length == 0) {
    return;
  }

  AsciiToUpperCase(chars, 1, dest);
  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
}
114 
// Maximum lengths of the individual subtags of a language tag.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
static constexpr size_t LanguageLength = 8;

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength = 4;

// unicode_region_subtag = (alpha{2} | digit{3}) ;
static constexpr size_t AlphaRegionLength = 2;
static constexpr size_t DigitRegionLength = 3;

// A region subtag must be able to hold the longer of the two region forms.
static constexpr size_t RegionLength =
    (AlphaRegionLength > DigitRegionLength) ? AlphaRegionLength
                                            : DigitRegionLength;

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength = 2;

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength = 2;

}  // namespace LanguageTagLimits
136 
137 // Fixed size language subtag which is stored inline in LanguageTag.
138 template <size_t Length>
139 class LanguageTagSubtag final {
140   uint8_t length_ = 0;
141   char chars_[Length] = {};  // zero initialize
142 
143  public:
144   LanguageTagSubtag() = default;
145 
146   LanguageTagSubtag(const LanguageTagSubtag&) = delete;
147   LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;
148 
length()149   size_t length() const { return length_; }
missing()150   bool missing() const { return length_ == 0; }
present()151   bool present() const { return length_ > 0; }
152 
span()153   mozilla::Span<const char> span() const { return {chars_, length_}; }
154 
155   template <typename CharT>
set(mozilla::Span<const CharT> str)156   void set(mozilla::Span<const CharT> str) {
157     MOZ_ASSERT(str.size() <= Length);
158     std::copy_n(str.data(), str.size(), chars_);
159     length_ = str.size();
160   }
161 
162   // The toXYZCase() methods are using |Length| instead of |length()|, because
163   // current compilers (tested GCC and Clang) can't infer the maximum string
164   // length - even when using hints like |std::min| - and instead are emitting
165   // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD
166   // code. (Emitting SIMD code doesn't make sense here, because the SIMD code
167   // only kicks in for long strings.) A fixed length will additionally ensure
168   // the compiler unrolls the loop in the case conversion code.
169 
toLowerCase()170   void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); }
171 
toUpperCase()172   void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); }
173 
toTitleCase()174   void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); }
175 
176   template <size_t N>
equalTo(const char (& str)[N])177   bool equalTo(const char (&str)[N]) const {
178     static_assert(N - 1 <= Length,
179                   "subtag literals must not exceed the maximum subtag length");
180 
181     return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0;
182   }
183 };
184 
185 using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
186 using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
187 using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
188 
189 /**
190  * Object representing a language tag.
191  *
192  * All subtags are already in canonicalized case.
193  */
194 class MOZ_STACK_CLASS LanguageTag final {
195   LanguageSubtag language_ = {};
196   ScriptSubtag script_ = {};
197   RegionSubtag region_ = {};
198 
199   using VariantsVector = Vector<JS::UniqueChars, 2>;
200   using ExtensionsVector = Vector<JS::UniqueChars, 2>;
201 
202   VariantsVector variants_;
203   ExtensionsVector extensions_;
204   JS::UniqueChars privateuse_ = nullptr;
205 
206   friend class LanguageTagParser;
207 
208   bool canonicalizeUnicodeExtension(JSContext* cx,
209                                     JS::UniqueChars& unicodeExtension);
210 
211   bool canonicalizeTransformExtension(JSContext* cx,
212                                       JS::UniqueChars& transformExtension);
213 
214  public:
215   static bool languageMapping(LanguageSubtag& language);
216   static bool complexLanguageMapping(const LanguageSubtag& language);
217 
218  private:
219   static bool regionMapping(RegionSubtag& region);
220   static bool complexRegionMapping(const RegionSubtag& region);
221 
222   void performComplexLanguageMappings();
223   void performComplexRegionMappings();
224   MOZ_MUST_USE bool performVariantMappings(JSContext* cx);
225 
226   MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx);
227 
228   static const char* replaceTransformExtensionType(
229       mozilla::Span<const char> key, mozilla::Span<const char> type);
230 
231  public:
232   /**
233    * Given a Unicode key and type, return the null-terminated preferred
234    * replacement for that type if there is one, or null if there is none, e.g.
235    * in effect
236    * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
237    * and
238    * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
239    */
240   static const char* replaceUnicodeExtensionType(
241       mozilla::Span<const char> key, mozilla::Span<const char> type);
242 
243  public:
LanguageTag(JSContext * cx)244   explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}
245 
246   LanguageTag(const LanguageTag&) = delete;
247   LanguageTag& operator=(const LanguageTag&) = delete;
248 
language()249   const LanguageSubtag& language() const { return language_; }
script()250   const ScriptSubtag& script() const { return script_; }
region()251   const RegionSubtag& region() const { return region_; }
variants()252   const auto& variants() const { return variants_; }
extensions()253   const auto& extensions() const { return extensions_; }
privateuse()254   const char* privateuse() const { return privateuse_.get(); }
255 
256   /**
257    * Return the Unicode extension subtag or nullptr if not present.
258    */
259   const char* unicodeExtension() const;
260 
261  private:
262   ptrdiff_t unicodeExtensionIndex() const;
263 
264  public:
265   /**
266    * Set the language subtag. The input must be a valid language subtag.
267    */
268   template <size_t N>
setLanguage(const char (& language)[N])269   void setLanguage(const char (&language)[N]) {
270     mozilla::Span<const char> span(language, N - 1);
271     MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
272     language_.set(span);
273   }
274 
275   /**
276    * Set the language subtag. The input must be a valid language subtag.
277    */
setLanguage(const LanguageSubtag & language)278   void setLanguage(const LanguageSubtag& language) {
279     MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span()));
280     language_.set(language.span());
281   }
282 
283   /**
284    * Set the script subtag. The input must be a valid script subtag.
285    */
286   template <size_t N>
setScript(const char (& script)[N])287   void setScript(const char (&script)[N]) {
288     mozilla::Span<const char> span(script, N - 1);
289     MOZ_ASSERT(IsStructurallyValidScriptTag(span));
290     script_.set(span);
291   }
292 
293   /**
294    * Set the script subtag. The input must be a valid script subtag or the empty
295    * string.
296    */
setScript(const ScriptSubtag & script)297   void setScript(const ScriptSubtag& script) {
298     MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span()));
299     script_.set(script.span());
300   }
301 
302   /**
303    * Set the region subtag. The input must be a valid region subtag.
304    */
305   template <size_t N>
setRegion(const char (& region)[N])306   void setRegion(const char (&region)[N]) {
307     mozilla::Span<const char> span(region, N - 1);
308     MOZ_ASSERT(IsStructurallyValidRegionTag(span));
309     region_.set(span);
310   }
311 
312   /**
313    * Set the region subtag. The input must be a valid region subtag or the empty
314    * empty string.
315    */
setRegion(const RegionSubtag & region)316   void setRegion(const RegionSubtag& region) {
317     MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span()));
318     region_.set(region.span());
319   }
320 
321   /**
322    * Removes all variant subtags.
323    */
clearVariants()324   void clearVariants() { variants_.clearAndFree(); }
325 
326   /**
327    * Set the Unicode extension subtag. The input must be a valid Unicode
328    * extension subtag.
329    */
330   bool setUnicodeExtension(JS::UniqueChars extension);
331 
332   /**
333    * Remove any Unicode extension subtag if present.
334    */
335   void clearUnicodeExtension();
336 
337   /**
338    * Set the private-use subtag. The input must be a valid private-use subtag
339    * or nullptr.
340    */
setPrivateuse(JS::UniqueChars privateuse)341   void setPrivateuse(JS::UniqueChars privateuse) {
342     MOZ_ASSERT(!privateuse ||
343                IsStructurallyValidPrivateUseTag(
344                    {privateuse.get(), strlen(privateuse.get())}));
345     privateuse_ = std::move(privateuse);
346   }
347 
348  private:
349   enum class DuplicateVariants { Reject, Accept };
350 
351   bool canonicalizeBaseName(JSContext* cx, DuplicateVariants duplicateVariants);
352 
353  public:
354   /**
355    * Canonicalize the base-name subtags, that means the language, script,
356    * region, and variant subtags.
357    */
canonicalizeBaseName(JSContext * cx)358   bool canonicalizeBaseName(JSContext* cx) {
359     return canonicalizeBaseName(cx, DuplicateVariants::Reject);
360   }
361 
362   /**
363    * Canonicalize all extension subtags.
364    */
365   bool canonicalizeExtensions(JSContext* cx);
366 
367   /**
368    * Canonicalizes the given structurally valid Unicode BCP 47 locale
369    * identifier, including regularized case of subtags. For example, the
370    * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
371    * where
372    *
373    *     Zh             ; 2*3ALPHA
374    *     -haNS          ; ["-" script]
375    *     -bu            ; ["-" region]
376    *     -variant2      ; *("-" variant)
377    *     -Variant1
378    *     -u-ca-chinese  ; *("-" extension)
379    *     -t-Zh-laTN
380    *     -x-PRIVATE     ; ["-" privateuse]
381    *
382    * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
383    *
384    * Spec: ECMAScript Internationalization API Specification, 6.2.3.
385    */
canonicalize(JSContext * cx)386   bool canonicalize(JSContext* cx) {
387     return canonicalizeBaseName(cx) && canonicalizeExtensions(cx);
388   }
389 
390   /**
391    * Return the string representation of this language tag.
392    */
393   JSString* toString(JSContext* cx) const;
394 
395   /**
396    * Return the string representation of this language tag as a null-terminated
397    * C-string.
398    */
399   JS::UniqueChars toStringZ(JSContext* cx) const;
400 
401   /**
402    * Add likely-subtags to the language tag.
403    *
404    * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
405    */
406   bool addLikelySubtags(JSContext* cx);
407 
408   /**
409    * Remove likely-subtags from the language tag.
410    *
411    * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
412    */
413   bool removeLikelySubtags(JSContext* cx);
414 };
415 
416 /**
417  * Parser for Unicode BCP 47 locale identifiers.
418  *
419  * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
420  */
421 class MOZ_STACK_CLASS LanguageTagParser final {
422  public:
423   // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
424   enum class TokenKind : uint8_t {
425     None = 0b000,
426     Alpha = 0b001,
427     Digit = 0b010,
428     AlphaDigit = 0b011,
429     Error = 0b100
430   };
431 
432  private:
433   class Token final {
434     size_t index_;
435     size_t length_;
436     TokenKind kind_;
437 
438    public:
Token(TokenKind kind,size_t index,size_t length)439     Token(TokenKind kind, size_t index, size_t length)
440         : index_(index), length_(length), kind_(kind) {}
441 
kind()442     TokenKind kind() const { return kind_; }
index()443     size_t index() const { return index_; }
length()444     size_t length() const { return length_; }
445 
isError()446     bool isError() const { return kind_ == TokenKind::Error; }
isNone()447     bool isNone() const { return kind_ == TokenKind::None; }
isAlpha()448     bool isAlpha() const { return kind_ == TokenKind::Alpha; }
isDigit()449     bool isDigit() const { return kind_ == TokenKind::Digit; }
isAlphaDigit()450     bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
451   };
452 
453   using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;
454 
455   const LocaleChars& locale_;
456   size_t length_;
457   size_t index_ = 0;
458 
LanguageTagParser(const LocaleChars & locale,size_t length)459   LanguageTagParser(const LocaleChars& locale, size_t length)
460       : locale_(locale), length_(length) {}
461 
charAtUnchecked(size_t index)462   char16_t charAtUnchecked(size_t index) const {
463     if (locale_.is<const JS::Latin1Char*>()) {
464       return locale_.as<const JS::Latin1Char*>()[index];
465     }
466     return locale_.as<const char16_t*>()[index];
467   }
468 
charAt(size_t index)469   char charAt(size_t index) const {
470     char16_t c = charAtUnchecked(index);
471     MOZ_ASSERT(mozilla::IsAscii(c));
472     return c;
473   }
474 
475   // Copy the token characters into |subtag|.
476   template <size_t N>
copyChars(const Token & tok,LanguageTagSubtag<N> & subtag)477   void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
478     size_t index = tok.index();
479     size_t length = tok.length();
480     if (locale_.is<const JS::Latin1Char*>()) {
481       using T = const JS::Latin1Char;
482       subtag.set(mozilla::MakeSpan(locale_.as<T*>() + index, length));
483     } else {
484       using T = const char16_t;
485       subtag.set(mozilla::MakeSpan(locale_.as<T*>() + index, length));
486     }
487   }
488 
489   // Create a string copy of |length| characters starting at |index|.
490   JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;
491 
492   // Create a string copy of the token characters.
chars(JSContext * cx,const Token & tok)493   JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
494     return chars(cx, tok.index(), tok.length());
495   }
496 
extension(JSContext * cx,const Token & start,const Token & end)497   JS::UniqueChars extension(JSContext* cx, const Token& start,
498                             const Token& end) const {
499     MOZ_ASSERT(start.index() < end.index());
500 
501     size_t length = end.index() - 1 - start.index();
502     return chars(cx, start.index(), length);
503   }
504 
505   Token nextToken();
506 
507   // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
508   //
509   // Four character language subtags are not allowed in Unicode BCP 47 locale
510   // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
511   // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
isLanguage(const Token & tok)512   bool isLanguage(const Token& tok) const {
513     return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
514                              (5 <= tok.length() && tok.length() <= 8));
515   }
516 
517   // unicode_script_subtag = alpha{4} ;
isScript(const Token & tok)518   bool isScript(const Token& tok) const {
519     return tok.isAlpha() && tok.length() == 4;
520   }
521 
522   // unicode_region_subtag = (alpha{2} | digit{3}) ;
isRegion(const Token & tok)523   bool isRegion(const Token& tok) const {
524     return (tok.isAlpha() && tok.length() == 2) ||
525            (tok.isDigit() && tok.length() == 3);
526   }
527 
528   // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
isVariant(const Token & tok)529   bool isVariant(const Token& tok) const {
530     return (5 <= tok.length() && tok.length() <= 8) ||
531            (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
532   }
533 
534   // Returns the code unit of the first character at the given singleton token.
535   // Always returns the lower case form of an alphabetical character.
singletonKey(const Token & tok)536   char singletonKey(const Token& tok) const {
537     MOZ_ASSERT(tok.length() == 1);
538     return AsciiToLowerCase(charAt(tok.index()));
539   }
540 
541   // extensions = unicode_locale_extensions |
542   //              transformed_extensions |
543   //              other_extensions ;
544   //
545   // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
546   //                                       (sep attribute)+ (sep keyword)*) ;
547   //
548   // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
549   //                                    (sep tfield)+) ;
550   //
551   // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isExtensionStart(const Token & tok)552   bool isExtensionStart(const Token& tok) const {
553     return tok.length() == 1 && singletonKey(tok) != 'x';
554   }
555 
556   // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isOtherExtensionPart(const Token & tok)557   bool isOtherExtensionPart(const Token& tok) const {
558     return 2 <= tok.length() && tok.length() <= 8;
559   }
560 
561   // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
562   //                                       (sep attribute)+ (sep keyword)*) ;
563   // keyword = key (sep type)? ;
isUnicodeExtensionPart(const Token & tok)564   bool isUnicodeExtensionPart(const Token& tok) const {
565     return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
566            isUnicodeExtensionAttribute(tok);
567   }
568 
569   // attribute = alphanum{3,8} ;
isUnicodeExtensionAttribute(const Token & tok)570   bool isUnicodeExtensionAttribute(const Token& tok) const {
571     return 3 <= tok.length() && tok.length() <= 8;
572   }
573 
574   // key = alphanum alpha ;
isUnicodeExtensionKey(const Token & tok)575   bool isUnicodeExtensionKey(const Token& tok) const {
576     return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
577   }
578 
579   // type = alphanum{3,8} (sep alphanum{3,8})* ;
isUnicodeExtensionType(const Token & tok)580   bool isUnicodeExtensionType(const Token& tok) const {
581     return 3 <= tok.length() && tok.length() <= 8;
582   }
583 
584   // tkey = alpha digit ;
isTransformExtensionKey(const Token & tok)585   bool isTransformExtensionKey(const Token& tok) const {
586     return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
587            mozilla::IsAsciiDigit(charAt(tok.index() + 1));
588   }
589 
590   // tvalue = (sep alphanum{3,8})+ ;
isTransformExtensionPart(const Token & tok)591   bool isTransformExtensionPart(const Token& tok) const {
592     return 3 <= tok.length() && tok.length() <= 8;
593   }
594 
595   // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUseStart(const Token & tok)596   bool isPrivateUseStart(const Token& tok) const {
597     return tok.length() == 1 && singletonKey(tok) == 'x';
598   }
599 
600   // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUsePart(const Token & tok)601   bool isPrivateUsePart(const Token& tok) const {
602     return 1 <= tok.length() && tok.length() <= 8;
603   }
604 
605   // Helper function for use in |parseBaseName| and
606   // |parseTlangInTransformExtension|.  Do not use this directly!
607   static JS::Result<bool> internalParseBaseName(JSContext* cx,
608                                                 LanguageTagParser& ts,
609                                                 LanguageTag& tag, Token& tok);
610 
611   // Parse the `unicode_language_id` production, i.e. the
612   // language/script/region/variants portion of a language tag, into |tag|.
613   // |tok| must be the current token.
parseBaseName(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)614   static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
615                                         LanguageTag& tag, Token& tok) {
616     return internalParseBaseName(cx, ts, tag, tok);
617   }
618 
619   // Parse the `tlang` production within a parsed 't' transform extension.
620   // The precise requirements for "previously parsed" are:
621   //
622   //   * the input begins from current token |tok| with a valid `tlang`
623   //   * the `tlang` is wholly lowercase (*not* canonical case)
624   //   * variant subtags in the `tlang` may contain duplicates and be
625   //     unordered
626   //
627   // Return an error on internal failure. Otherwise, return a success value. If
628   // there was no `tlang`, then |tag.language().missing()|. But if there was a
629   // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
630   // parse input.
parseTlangInTransformExtension(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)631   static JS::Result<JS::Ok> parseTlangInTransformExtension(
632       JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
633     MOZ_ASSERT(ts.isLanguage(tok));
634     return internalParseBaseName(cx, ts, tag, tok).map([](bool parsed) {
635       MOZ_ASSERT(parsed);
636       return JS::Ok();
637     });
638   }
639 
640   friend class LanguageTag;
641 
642   class Range final {
643     size_t begin_;
644     size_t length_;
645 
646    public:
Range(size_t begin,size_t length)647     Range(size_t begin, size_t length) : begin_(begin), length_(length) {}
648 
649     template <typename T>
begin(T * ptr)650     T* begin(T* ptr) const {
651       return ptr + begin_;
652     }
653 
length()654     size_t length() const { return length_; }
655   };
656 
657   using TFieldVector = js::Vector<Range, 8>;
658   using AttributesVector = js::Vector<Range, 8>;
659   using KeywordsVector = js::Vector<Range, 8>;
660 
661   // Parse |extension|, which must be a validated, fully lowercase
662   // `transformed_extensions` subtag, and fill |tag| and |fields| from the
663   // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
664   // with |extension|.
665   static JS::Result<bool> parseTransformExtension(
666       JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag,
667       TFieldVector& fields);
668 
669   // Parse |extension|, which must be a validated, fully lowercase
670   // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
671   // from the `attribute` and `keyword` components.
672   static JS::Result<bool> parseUnicodeExtension(
673       JSContext* cx, mozilla::Span<const char> extension,
674       AttributesVector& attributes, KeywordsVector& keywords);
675 
676   static JS::Result<bool> tryParse(JSContext* cx, LocaleChars& localeChars,
677                                    size_t localeLength, LanguageTag& tag);
678 
679  public:
680   // Parse the input string as a language tag. Reports an error to the context
681   // if the input can't be parsed completely.
682   static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
683 
684   // Parse the input string as a language tag. Reports an error to the context
685   // if the input can't be parsed completely.
686   static bool parse(JSContext* cx, mozilla::Span<const char> locale,
687                     LanguageTag& tag);
688 
689   // Parse the input string as a language tag. Returns Ok(true) if the input
690   // could be completely parsed, Ok(false) if the input couldn't be parsed,
691   // or Err() in case of internal error.
692   static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
693                                    LanguageTag& tag);
694 
695   // Parse the input string as a language tag. Returns Ok(true) if the input
696   // could be completely parsed, Ok(false) if the input couldn't be parsed,
697   // or Err() in case of internal error.
698   static JS::Result<bool> tryParse(JSContext* cx,
699                                    mozilla::Span<const char> locale,
700                                    LanguageTag& tag);
701 
702   // Parse the input string as the base-name parts (language, script, region,
703   // variants) of a language tag. Ignores any trailing characters.
704   static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale,
705                             LanguageTag& tag);
706 
707   // Parse the input string as the base-name parts (language, script, region,
708   // variants) of a language tag. Returns Ok(true) if the input could be
709   // completely parsed, Ok(false) if the input couldn't be parsed, or Err() in
710   // case of internal error.
711   static JS::Result<bool> tryParseBaseName(JSContext* cx,
712                                            JSLinearString* locale,
713                                            LanguageTag& tag);
714 
715   // Return true iff |extension| can be parsed as a Unicode extension subtag.
716   static bool canParseUnicodeExtension(mozilla::Span<const char> extension);
717 
718   // Return true iff |unicodeType| can be parsed as a Unicode extension type.
719   static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
720 };
721 
722 MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
723 
724 /**
725  * Parse a string as a standalone |language| tag. If |str| is a standalone
726  * language tag, store it in |result| and return true. Otherwise return false.
727  */
728 MOZ_MUST_USE bool ParseStandaloneLanguageTag(JS::Handle<JSLinearString*> str,
729                                              LanguageSubtag& result);
730 
731 /**
732  * Parse a string as a standalone |script| tag. If |str| is a standalone script
733  * tag, store it in |result| and return true. Otherwise return false.
734  */
735 MOZ_MUST_USE bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str,
736                                            ScriptSubtag& result);
737 
738 /**
739  * Parse a string as a standalone |region| tag. If |str| is a standalone region
740  * tag, store it in |result| and return true. Otherwise return false.
741  */
742 MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str,
743                                            RegionSubtag& result);
744 
745 /**
746  * Parse a string as an ISO-639 language code. Return |nullptr| in the result if
747  * the input could not be parsed or the canonical form of the resulting language
748  * tag contains more than a single language subtag.
749  */
750 JS::Result<JSString*> ParseStandaloneISO639LanguageTag(
751     JSContext* cx, JS::Handle<JSLinearString*> str);
752 
753 class UnicodeExtensionKeyword final {
754   char key_[LanguageTagLimits::UnicodeKeyLength];
755   JSLinearString* type_;
756 
757  public:
758   using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1];
759   using UnicodeKeySpan =
760       mozilla::Span<const char, LanguageTagLimits::UnicodeKeyLength>;
761 
UnicodeExtensionKeyword(UnicodeKey key,JSLinearString * type)762   UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type)
763       : key_{key[0], key[1]}, type_(type) {}
764 
key()765   UnicodeKeySpan key() const { return {key_, sizeof(key_)}; }
type()766   JSLinearString* type() const { return type_; }
767 
768   void trace(JSTracer* trc);
769 };
770 
771 extern MOZ_MUST_USE bool ApplyUnicodeExtensionToTag(
772     JSContext* cx, LanguageTag& tag,
773     JS::HandleVector<UnicodeExtensionKeyword> keywords);
774 
775 }  // namespace intl
776 
777 }  // namespace js
778 
779 #endif /* builtin_intl_LanguageTag_h */
780