1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /* Structured representation of Unicode locale IDs used with Intl functions. */
8 
9 #ifndef builtin_intl_LanguageTag_h
10 #define builtin_intl_LanguageTag_h
11 
12 #include "mozilla/Assertions.h"
13 #include "mozilla/Span.h"
14 #include "mozilla/TextUtils.h"
15 #include "mozilla/TypedEnumBits.h"
16 #include "mozilla/Variant.h"
17 
18 #include <algorithm>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <string.h>
22 #include <utility>
23 
24 #include "js/AllocPolicy.h"
25 #include "js/GCAPI.h"
26 #include "js/Result.h"
27 #include "js/RootingAPI.h"
28 #include "js/Utility.h"
29 #include "js/Vector.h"
30 
31 struct JS_PUBLIC_API JSContext;
32 class JSLinearString;
33 class JS_PUBLIC_API JSString;
34 class JS_PUBLIC_API JSTracer;
35 
36 namespace js {
37 
38 namespace intl {
39 
40 /**
41  * Return true if |language| is a valid language subtag.
42  */
43 template <typename CharT>
44 bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language);
45 
46 /**
47  * Return true if |script| is a valid script subtag.
48  */
49 template <typename CharT>
50 bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script);
51 
52 /**
53  * Return true if |region| is a valid region subtag.
54  */
55 template <typename CharT>
56 bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region);
57 
58 #ifdef DEBUG
59 /**
60  * Return true if |variant| is a valid variant subtag.
61  */
62 bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant);
63 
64 /**
65  * Return true if |extension| is a valid Unicode extension subtag.
66  */
67 bool IsStructurallyValidUnicodeExtensionTag(
68     mozilla::Span<const char> extension);
69 
70 /**
71  * Return true if |privateUse| is a valid private-use subtag.
72  */
73 bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse);
74 
75 #endif
76 
77 template <typename CharT>
AsciiToLowerCase(CharT c)78 char AsciiToLowerCase(CharT c) {
79   MOZ_ASSERT(mozilla::IsAscii(c));
80   return mozilla::IsAsciiUppercaseAlpha(c) ? (c + 0x20) : c;
81 }
82 
83 template <typename CharT>
AsciiToUpperCase(CharT c)84 char AsciiToUpperCase(CharT c) {
85   MOZ_ASSERT(mozilla::IsAscii(c));
86   return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c;
87 }
88 
89 template <typename CharT>
AsciiToLowerCase(CharT * chars,size_t length,char * dest)90 void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
91   // Tell the analysis the |std::transform| function can't GC.
92   JS::AutoSuppressGCAnalysis nogc;
93 
94   char (&fn)(CharT) = AsciiToLowerCase;
95   std::transform(chars, chars + length, dest, fn);
96 }
97 
98 template <typename CharT>
AsciiToUpperCase(CharT * chars,size_t length,char * dest)99 void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
100   // Tell the analysis the |std::transform| function can't GC.
101   JS::AutoSuppressGCAnalysis nogc;
102 
103   char (&fn)(CharT) = AsciiToUpperCase;
104   std::transform(chars, chars + length, dest, fn);
105 }
106 
/**
 * Write the title case form of the |length| ASCII characters starting at
 * |chars| to |dest|: the first character is upper-cased, all remaining
 * characters are lower-cased. Nothing is written for an empty input.
 */
template <typename CharT>
void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
  if (length == 0) {
    return;
  }

  AsciiToUpperCase(chars, 1, dest);
  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
}
114 
// Length limits of the fixed-size language tag components.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
//
// Maximum length of a language subtag.
static constexpr size_t LanguageLength = 8;

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength = 4;

// unicode_region_subtag = (alpha{2} | digit{3}) ;
//
// |RegionLength| is the maximum over both region forms.
static constexpr size_t AlphaRegionLength = 2;
static constexpr size_t DigitRegionLength = 3;
static constexpr size_t RegionLength =
    std::max(AlphaRegionLength, DigitRegionLength);

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength = 2;

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength = 2;

}  // namespace LanguageTagLimits
136 
137 // Fixed size language subtag which is stored inline in LanguageTag.
138 template <size_t Length>
139 class LanguageTagSubtag final {
140   uint8_t length_ = 0;
141   char chars_[Length] = {};  // zero initialize
142 
143  public:
144   LanguageTagSubtag() = default;
145 
146   LanguageTagSubtag(const LanguageTagSubtag&) = delete;
147   LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;
148 
length()149   size_t length() const { return length_; }
missing()150   bool missing() const { return length_ == 0; }
present()151   bool present() const { return length_ > 0; }
152 
span()153   mozilla::Span<const char> span() const { return {chars_, length_}; }
154 
155   template <typename CharT>
set(mozilla::Span<const CharT> str)156   void set(mozilla::Span<const CharT> str) {
157     MOZ_ASSERT(str.size() <= Length);
158     std::copy_n(str.data(), str.size(), chars_);
159     length_ = str.size();
160   }
161 
162   // The toXYZCase() methods are using |Length| instead of |length()|, because
163   // current compilers (tested GCC and Clang) can't infer the maximum string
164   // length - even when using hints like |std::min| - and instead are emitting
165   // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD
166   // code. (Emitting SIMD code doesn't make sense here, because the SIMD code
167   // only kicks in for long strings.) A fixed length will additionally ensure
168   // the compiler unrolls the loop in the case conversion code.
169 
toLowerCase()170   void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); }
171 
toUpperCase()172   void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); }
173 
toTitleCase()174   void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); }
175 
176   template <size_t N>
equalTo(const char (& str)[N])177   bool equalTo(const char (&str)[N]) const {
178     static_assert(N - 1 <= Length,
179                   "subtag literals must not exceed the maximum subtag length");
180 
181     return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0;
182   }
183 };
184 
185 using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
186 using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
187 using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
188 
189 /**
190  * Object representing a language tag.
191  *
192  * All subtags are already in canonicalized case.
193  */
194 class MOZ_STACK_CLASS LanguageTag final {
195   LanguageSubtag language_ = {};
196   ScriptSubtag script_ = {};
197   RegionSubtag region_ = {};
198 
199   using VariantsVector = Vector<JS::UniqueChars, 2>;
200   using ExtensionsVector = Vector<JS::UniqueChars, 2>;
201 
202   VariantsVector variants_;
203   ExtensionsVector extensions_;
204   JS::UniqueChars privateuse_ = nullptr;
205 
206   friend class LanguageTagParser;
207 
208   bool canonicalizeUnicodeExtension(JSContext* cx,
209                                     JS::UniqueChars& unicodeExtension);
210 
211   bool canonicalizeTransformExtension(JSContext* cx,
212                                       JS::UniqueChars& transformExtension);
213 
214  public:
215   static bool languageMapping(LanguageSubtag& language);
216   static bool complexLanguageMapping(const LanguageSubtag& language);
217 
218  private:
219   static bool scriptMapping(ScriptSubtag& script);
220   static bool regionMapping(RegionSubtag& region);
221   static bool complexRegionMapping(const RegionSubtag& region);
222 
223   void performComplexLanguageMappings();
224   void performComplexRegionMappings();
225   [[nodiscard]] bool performVariantMappings(JSContext* cx);
226 
227   [[nodiscard]] bool updateLegacyMappings(JSContext* cx);
228 
229   static bool signLanguageMapping(LanguageSubtag& language,
230                                   const RegionSubtag& region);
231 
232   static const char* replaceTransformExtensionType(
233       mozilla::Span<const char> key, mozilla::Span<const char> type);
234 
235  public:
236   /**
237    * Given a Unicode key and type, return the null-terminated preferred
238    * replacement for that type if there is one, or null if there is none, e.g.
239    * in effect
240    * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
241    * and
242    * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
243    */
244   static const char* replaceUnicodeExtensionType(
245       mozilla::Span<const char> key, mozilla::Span<const char> type);
246 
247  public:
LanguageTag(JSContext * cx)248   explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}
249 
250   LanguageTag(const LanguageTag&) = delete;
251   LanguageTag& operator=(const LanguageTag&) = delete;
252 
language()253   const LanguageSubtag& language() const { return language_; }
script()254   const ScriptSubtag& script() const { return script_; }
region()255   const RegionSubtag& region() const { return region_; }
variants()256   const auto& variants() const { return variants_; }
extensions()257   const auto& extensions() const { return extensions_; }
privateuse()258   const char* privateuse() const { return privateuse_.get(); }
259 
260   /**
261    * Return the Unicode extension subtag or nullptr if not present.
262    */
263   const char* unicodeExtension() const;
264 
265  private:
266   ptrdiff_t unicodeExtensionIndex() const;
267 
268  public:
269   /**
270    * Set the language subtag. The input must be a valid language subtag.
271    */
272   template <size_t N>
setLanguage(const char (& language)[N])273   void setLanguage(const char (&language)[N]) {
274     mozilla::Span<const char> span(language, N - 1);
275     MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
276     language_.set(span);
277   }
278 
279   /**
280    * Set the language subtag. The input must be a valid language subtag.
281    */
setLanguage(const LanguageSubtag & language)282   void setLanguage(const LanguageSubtag& language) {
283     MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span()));
284     language_.set(language.span());
285   }
286 
287   /**
288    * Set the script subtag. The input must be a valid script subtag.
289    */
290   template <size_t N>
setScript(const char (& script)[N])291   void setScript(const char (&script)[N]) {
292     mozilla::Span<const char> span(script, N - 1);
293     MOZ_ASSERT(IsStructurallyValidScriptTag(span));
294     script_.set(span);
295   }
296 
297   /**
298    * Set the script subtag. The input must be a valid script subtag or the empty
299    * string.
300    */
setScript(const ScriptSubtag & script)301   void setScript(const ScriptSubtag& script) {
302     MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span()));
303     script_.set(script.span());
304   }
305 
306   /**
307    * Set the region subtag. The input must be a valid region subtag.
308    */
309   template <size_t N>
setRegion(const char (& region)[N])310   void setRegion(const char (&region)[N]) {
311     mozilla::Span<const char> span(region, N - 1);
312     MOZ_ASSERT(IsStructurallyValidRegionTag(span));
313     region_.set(span);
314   }
315 
316   /**
317    * Set the region subtag. The input must be a valid region subtag or the empty
318    * empty string.
319    */
setRegion(const RegionSubtag & region)320   void setRegion(const RegionSubtag& region) {
321     MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span()));
322     region_.set(region.span());
323   }
324 
325   /**
326    * Removes all variant subtags.
327    */
clearVariants()328   void clearVariants() { variants_.clearAndFree(); }
329 
330   /**
331    * Set the Unicode extension subtag. The input must be a valid Unicode
332    * extension subtag.
333    */
334   bool setUnicodeExtension(JS::UniqueChars extension);
335 
336   /**
337    * Remove any Unicode extension subtag if present.
338    */
339   void clearUnicodeExtension();
340 
341   /**
342    * Set the private-use subtag. The input must be a valid private-use subtag
343    * or nullptr.
344    */
setPrivateuse(JS::UniqueChars privateuse)345   void setPrivateuse(JS::UniqueChars privateuse) {
346     MOZ_ASSERT(!privateuse ||
347                IsStructurallyValidPrivateUseTag(
348                    {privateuse.get(), strlen(privateuse.get())}));
349     privateuse_ = std::move(privateuse);
350   }
351 
352   /** Canonicalize the base-name (language, script, region, variant) subtags. */
353   bool canonicalizeBaseName(JSContext* cx);
354 
355   /**
356    * Canonicalize all extension subtags.
357    */
358   bool canonicalizeExtensions(JSContext* cx);
359 
360   /**
361    * Canonicalizes the given structurally valid Unicode BCP 47 locale
362    * identifier, including regularized case of subtags. For example, the
363    * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
364    * where
365    *
366    *     Zh             ; 2*3ALPHA
367    *     -haNS          ; ["-" script]
368    *     -bu            ; ["-" region]
369    *     -variant2      ; *("-" variant)
370    *     -Variant1
371    *     -u-ca-chinese  ; *("-" extension)
372    *     -t-Zh-laTN
373    *     -x-PRIVATE     ; ["-" privateuse]
374    *
375    * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
376    *
377    * Spec: ECMAScript Internationalization API Specification, 6.2.3.
378    */
canonicalize(JSContext * cx)379   bool canonicalize(JSContext* cx) {
380     return canonicalizeBaseName(cx) && canonicalizeExtensions(cx);
381   }
382 
383   /**
384    * Return the string representation of this language tag.
385    */
386   JSString* toString(JSContext* cx) const;
387 
388   /**
389    * Return the string representation of this language tag as a null-terminated
390    * C-string.
391    */
392   JS::UniqueChars toStringZ(JSContext* cx) const;
393 
394   /**
395    * Add likely-subtags to the language tag.
396    *
397    * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
398    */
399   bool addLikelySubtags(JSContext* cx);
400 
401   /**
402    * Remove likely-subtags from the language tag.
403    *
404    * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
405    */
406   bool removeLikelySubtags(JSContext* cx);
407 };
408 
409 /**
410  * Parser for Unicode BCP 47 locale identifiers.
411  *
412  * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
413  */
414 class MOZ_STACK_CLASS LanguageTagParser final {
415  public:
416   // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
417   enum class TokenKind : uint8_t {
418     None = 0b000,
419     Alpha = 0b001,
420     Digit = 0b010,
421     AlphaDigit = 0b011,
422     Error = 0b100
423   };
424 
425  private:
426   class Token final {
427     size_t index_;
428     size_t length_;
429     TokenKind kind_;
430 
431    public:
Token(TokenKind kind,size_t index,size_t length)432     Token(TokenKind kind, size_t index, size_t length)
433         : index_(index), length_(length), kind_(kind) {}
434 
kind()435     TokenKind kind() const { return kind_; }
index()436     size_t index() const { return index_; }
length()437     size_t length() const { return length_; }
438 
isError()439     bool isError() const { return kind_ == TokenKind::Error; }
isNone()440     bool isNone() const { return kind_ == TokenKind::None; }
isAlpha()441     bool isAlpha() const { return kind_ == TokenKind::Alpha; }
isDigit()442     bool isDigit() const { return kind_ == TokenKind::Digit; }
isAlphaDigit()443     bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
444   };
445 
446   using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;
447 
448   const LocaleChars& locale_;
449   size_t length_;
450   size_t index_ = 0;
451 
LanguageTagParser(const LocaleChars & locale,size_t length)452   LanguageTagParser(const LocaleChars& locale, size_t length)
453       : locale_(locale), length_(length) {}
454 
charAtUnchecked(size_t index)455   char16_t charAtUnchecked(size_t index) const {
456     if (locale_.is<const JS::Latin1Char*>()) {
457       return locale_.as<const JS::Latin1Char*>()[index];
458     }
459     return locale_.as<const char16_t*>()[index];
460   }
461 
charAt(size_t index)462   char charAt(size_t index) const {
463     char16_t c = charAtUnchecked(index);
464     MOZ_ASSERT(mozilla::IsAscii(c));
465     return c;
466   }
467 
468   // Copy the token characters into |subtag|.
469   template <size_t N>
copyChars(const Token & tok,LanguageTagSubtag<N> & subtag)470   void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
471     size_t index = tok.index();
472     size_t length = tok.length();
473     if (locale_.is<const JS::Latin1Char*>()) {
474       using T = const JS::Latin1Char;
475       subtag.set(mozilla::Span(locale_.as<T*>() + index, length));
476     } else {
477       using T = const char16_t;
478       subtag.set(mozilla::Span(locale_.as<T*>() + index, length));
479     }
480   }
481 
482   // Create a string copy of |length| characters starting at |index|.
483   JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;
484 
485   // Create a string copy of the token characters.
chars(JSContext * cx,const Token & tok)486   JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
487     return chars(cx, tok.index(), tok.length());
488   }
489 
extension(JSContext * cx,const Token & start,const Token & end)490   JS::UniqueChars extension(JSContext* cx, const Token& start,
491                             const Token& end) const {
492     MOZ_ASSERT(start.index() < end.index());
493 
494     size_t length = end.index() - 1 - start.index();
495     return chars(cx, start.index(), length);
496   }
497 
498   Token nextToken();
499 
500   // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
501   //
502   // Four character language subtags are not allowed in Unicode BCP 47 locale
503   // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
504   // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
isLanguage(const Token & tok)505   bool isLanguage(const Token& tok) const {
506     return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
507                              (5 <= tok.length() && tok.length() <= 8));
508   }
509 
510   // unicode_script_subtag = alpha{4} ;
isScript(const Token & tok)511   bool isScript(const Token& tok) const {
512     return tok.isAlpha() && tok.length() == 4;
513   }
514 
515   // unicode_region_subtag = (alpha{2} | digit{3}) ;
isRegion(const Token & tok)516   bool isRegion(const Token& tok) const {
517     return (tok.isAlpha() && tok.length() == 2) ||
518            (tok.isDigit() && tok.length() == 3);
519   }
520 
521   // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
isVariant(const Token & tok)522   bool isVariant(const Token& tok) const {
523     return (5 <= tok.length() && tok.length() <= 8) ||
524            (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
525   }
526 
527   // Returns the code unit of the first character at the given singleton token.
528   // Always returns the lower case form of an alphabetical character.
singletonKey(const Token & tok)529   char singletonKey(const Token& tok) const {
530     MOZ_ASSERT(tok.length() == 1);
531     return AsciiToLowerCase(charAt(tok.index()));
532   }
533 
534   // extensions = unicode_locale_extensions |
535   //              transformed_extensions |
536   //              other_extensions ;
537   //
538   // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
539   //                                       (sep attribute)+ (sep keyword)*) ;
540   //
541   // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
542   //                                    (sep tfield)+) ;
543   //
544   // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isExtensionStart(const Token & tok)545   bool isExtensionStart(const Token& tok) const {
546     return tok.length() == 1 && singletonKey(tok) != 'x';
547   }
548 
549   // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isOtherExtensionPart(const Token & tok)550   bool isOtherExtensionPart(const Token& tok) const {
551     return 2 <= tok.length() && tok.length() <= 8;
552   }
553 
554   // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
555   //                                       (sep attribute)+ (sep keyword)*) ;
556   // keyword = key (sep type)? ;
isUnicodeExtensionPart(const Token & tok)557   bool isUnicodeExtensionPart(const Token& tok) const {
558     return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
559            isUnicodeExtensionAttribute(tok);
560   }
561 
562   // attribute = alphanum{3,8} ;
isUnicodeExtensionAttribute(const Token & tok)563   bool isUnicodeExtensionAttribute(const Token& tok) const {
564     return 3 <= tok.length() && tok.length() <= 8;
565   }
566 
567   // key = alphanum alpha ;
isUnicodeExtensionKey(const Token & tok)568   bool isUnicodeExtensionKey(const Token& tok) const {
569     return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
570   }
571 
572   // type = alphanum{3,8} (sep alphanum{3,8})* ;
isUnicodeExtensionType(const Token & tok)573   bool isUnicodeExtensionType(const Token& tok) const {
574     return 3 <= tok.length() && tok.length() <= 8;
575   }
576 
577   // tkey = alpha digit ;
isTransformExtensionKey(const Token & tok)578   bool isTransformExtensionKey(const Token& tok) const {
579     return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
580            mozilla::IsAsciiDigit(charAt(tok.index() + 1));
581   }
582 
583   // tvalue = (sep alphanum{3,8})+ ;
isTransformExtensionPart(const Token & tok)584   bool isTransformExtensionPart(const Token& tok) const {
585     return 3 <= tok.length() && tok.length() <= 8;
586   }
587 
588   // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUseStart(const Token & tok)589   bool isPrivateUseStart(const Token& tok) const {
590     return tok.length() == 1 && singletonKey(tok) == 'x';
591   }
592 
593   // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUsePart(const Token & tok)594   bool isPrivateUsePart(const Token& tok) const {
595     return 1 <= tok.length() && tok.length() <= 8;
596   }
597 
598   // Helper function for use in |parseBaseName| and
599   // |parseTlangInTransformExtension|.  Do not use this directly!
600   static JS::Result<bool> internalParseBaseName(JSContext* cx,
601                                                 LanguageTagParser& ts,
602                                                 LanguageTag& tag, Token& tok);
603 
604   // Parse the `unicode_language_id` production, i.e. the
605   // language/script/region/variants portion of a language tag, into |tag|.
606   // |tok| must be the current token.
parseBaseName(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)607   static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
608                                         LanguageTag& tag, Token& tok) {
609     return internalParseBaseName(cx, ts, tag, tok);
610   }
611 
612   // Parse the `tlang` production within a parsed 't' transform extension.
613   // The precise requirements for "previously parsed" are:
614   //
615   //   * the input begins from current token |tok| with a valid `tlang`
616   //   * the `tlang` is wholly lowercase (*not* canonical case)
617   //   * variant subtags in the `tlang` may contain duplicates and be
618   //     unordered
619   //
620   // Return an error on internal failure. Otherwise, return a success value. If
621   // there was no `tlang`, then |tag.language().missing()|. But if there was a
622   // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
623   // parse input.
parseTlangInTransformExtension(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)624   static JS::Result<JS::Ok> parseTlangInTransformExtension(
625       JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
626     MOZ_ASSERT(ts.isLanguage(tok));
627     return internalParseBaseName(cx, ts, tag, tok).map([](bool parsed) {
628       MOZ_ASSERT(parsed);
629       return JS::Ok();
630     });
631   }
632 
633   friend class LanguageTag;
634 
635   class Range final {
636     size_t begin_;
637     size_t length_;
638 
639    public:
Range(size_t begin,size_t length)640     Range(size_t begin, size_t length) : begin_(begin), length_(length) {}
641 
642     template <typename T>
begin(T * ptr)643     T* begin(T* ptr) const {
644       return ptr + begin_;
645     }
646 
length()647     size_t length() const { return length_; }
648   };
649 
650   using TFieldVector = js::Vector<Range, 8>;
651   using AttributesVector = js::Vector<Range, 8>;
652   using KeywordsVector = js::Vector<Range, 8>;
653 
654   // Parse |extension|, which must be a validated, fully lowercase
655   // `transformed_extensions` subtag, and fill |tag| and |fields| from the
656   // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
657   // with |extension|.
658   static JS::Result<bool> parseTransformExtension(
659       JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag,
660       TFieldVector& fields);
661 
662   // Parse |extension|, which must be a validated, fully lowercase
663   // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
664   // from the `attribute` and `keyword` components.
665   static JS::Result<bool> parseUnicodeExtension(
666       JSContext* cx, mozilla::Span<const char> extension,
667       AttributesVector& attributes, KeywordsVector& keywords);
668 
669   static JS::Result<bool> tryParse(JSContext* cx, LocaleChars& localeChars,
670                                    size_t localeLength, LanguageTag& tag);
671 
672  public:
673   // Parse the input string as a language tag. Reports an error to the context
674   // if the input can't be parsed completely.
675   static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
676 
677   // Parse the input string as a language tag. Reports an error to the context
678   // if the input can't be parsed completely.
679   static bool parse(JSContext* cx, mozilla::Span<const char> locale,
680                     LanguageTag& tag);
681 
682   // Parse the input string as a language tag. Returns Ok(true) if the input
683   // could be completely parsed, Ok(false) if the input couldn't be parsed,
684   // or Err() in case of internal error.
685   static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
686                                    LanguageTag& tag);
687 
688   // Parse the input string as a language tag. Returns Ok(true) if the input
689   // could be completely parsed, Ok(false) if the input couldn't be parsed,
690   // or Err() in case of internal error.
691   static JS::Result<bool> tryParse(JSContext* cx,
692                                    mozilla::Span<const char> locale,
693                                    LanguageTag& tag);
694 
695   // Parse the input string as the base-name parts (language, script, region,
696   // variants) of a language tag. Ignores any trailing characters.
697   static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale,
698                             LanguageTag& tag);
699 
700   // Parse the input string as the base-name parts (language, script, region,
701   // variants) of a language tag. Returns Ok(true) if the input could be
702   // completely parsed, Ok(false) if the input couldn't be parsed, or Err() in
703   // case of internal error.
704   static JS::Result<bool> tryParseBaseName(JSContext* cx,
705                                            JSLinearString* locale,
706                                            LanguageTag& tag);
707 
708   // Return true iff |extension| can be parsed as a Unicode extension subtag.
709   static bool canParseUnicodeExtension(mozilla::Span<const char> extension);
710 
711   // Return true iff |unicodeType| can be parsed as a Unicode extension type.
712   static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
713 };
714 
715 MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
716 
717 /**
718  * Parse a string as a standalone |language| tag. If |str| is a standalone
719  * language tag, store it in |result| and return true. Otherwise return false.
720  */
721 [[nodiscard]] bool ParseStandaloneLanguageTag(JS::Handle<JSLinearString*> str,
722                                               LanguageSubtag& result);
723 
724 /**
725  * Parse a string as a standalone |script| tag. If |str| is a standalone script
726  * tag, store it in |result| and return true. Otherwise return false.
727  */
728 [[nodiscard]] bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str,
729                                             ScriptSubtag& result);
730 
731 /**
732  * Parse a string as a standalone |region| tag. If |str| is a standalone region
733  * tag, store it in |result| and return true. Otherwise return false.
734  */
735 [[nodiscard]] bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str,
736                                             RegionSubtag& result);
737 
738 /**
739  * Parse a string as an ISO-639 language code. Return |nullptr| in the result if
740  * the input could not be parsed or the canonical form of the resulting language
741  * tag contains more than a single language subtag.
742  */
743 JS::Result<JSString*> ParseStandaloneISO639LanguageTag(
744     JSContext* cx, JS::Handle<JSLinearString*> str);
745 
746 class UnicodeExtensionKeyword final {
747   char key_[LanguageTagLimits::UnicodeKeyLength];
748   JSLinearString* type_;
749 
750  public:
751   using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1];
752   using UnicodeKeySpan =
753       mozilla::Span<const char, LanguageTagLimits::UnicodeKeyLength>;
754 
UnicodeExtensionKeyword(UnicodeKey key,JSLinearString * type)755   UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type)
756       : key_{key[0], key[1]}, type_(type) {}
757 
key()758   UnicodeKeySpan key() const { return {key_, sizeof(key_)}; }
type()759   JSLinearString* type() const { return type_; }
760 
761   void trace(JSTracer* trc);
762 };
763 
764 [[nodiscard]] extern bool ApplyUnicodeExtensionToTag(
765     JSContext* cx, LanguageTag& tag,
766     JS::HandleVector<UnicodeExtensionKeyword> keywords);
767 
768 }  // namespace intl
769 
770 }  // namespace js
771 
772 #endif /* builtin_intl_LanguageTag_h */
773