1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 /* Structured representation of Unicode locale IDs used with Intl functions. */
8
9 #ifndef builtin_intl_LanguageTag_h
10 #define builtin_intl_LanguageTag_h
11
12 #include "mozilla/Assertions.h"
13 #include "mozilla/Span.h"
14 #include "mozilla/TextUtils.h"
15 #include "mozilla/TypedEnumBits.h"
16 #include "mozilla/Variant.h"
17
18 #include <algorithm>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <string.h>
22 #include <utility>
23
24 #include "js/AllocPolicy.h"
25 #include "js/GCAPI.h"
26 #include "js/Result.h"
27 #include "js/RootingAPI.h"
28 #include "js/Utility.h"
29 #include "js/Vector.h"
30
31 struct JS_PUBLIC_API JSContext;
32 class JSLinearString;
33 class JS_PUBLIC_API JSString;
34 class JS_PUBLIC_API JSTracer;
35
36 namespace js {
37
38 namespace intl {
39
/**
 * Return true if |language| is a valid language subtag.
 *
 * Only declared in this header. |CharT| is the code unit type of the
 * characters viewed by |language|.
 */
template <typename CharT>
bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> language);

/**
 * Return true if |script| is a valid script subtag.
 *
 * Only declared in this header. |CharT| is the code unit type of the
 * characters viewed by |script|.
 */
template <typename CharT>
bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> script);

/**
 * Return true if |region| is a valid region subtag.
 *
 * Only declared in this header. |CharT| is the code unit type of the
 * characters viewed by |region|.
 */
template <typename CharT>
bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> region);
57
// The following validators are declared in DEBUG builds only. Within this
// header, |IsStructurallyValidPrivateUseTag| is used only inside a MOZ_ASSERT.
#ifdef DEBUG
/**
 * Return true if |variant| is a valid variant subtag.
 */
bool IsStructurallyValidVariantTag(mozilla::Span<const char> variant);

/**
 * Return true if |extension| is a valid Unicode extension subtag.
 */
bool IsStructurallyValidUnicodeExtensionTag(
    mozilla::Span<const char> extension);

/**
 * Return true if |privateUse| is a valid private-use subtag.
 */
bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> privateUse);

#endif
76
77 template <typename CharT>
AsciiToLowerCase(CharT c)78 char AsciiToLowerCase(CharT c) {
79 MOZ_ASSERT(mozilla::IsAscii(c));
80 return mozilla::IsAsciiUppercaseAlpha(c) ? (c + 0x20) : c;
81 }
82
83 template <typename CharT>
AsciiToUpperCase(CharT c)84 char AsciiToUpperCase(CharT c) {
85 MOZ_ASSERT(mozilla::IsAscii(c));
86 return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c;
87 }
88
89 template <typename CharT>
AsciiToLowerCase(CharT * chars,size_t length,char * dest)90 void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
91 // Tell the analysis the |std::transform| function can't GC.
92 JS::AutoSuppressGCAnalysis nogc;
93
94 char (&fn)(CharT) = AsciiToLowerCase;
95 std::transform(chars, chars + length, dest, fn);
96 }
97
98 template <typename CharT>
AsciiToUpperCase(CharT * chars,size_t length,char * dest)99 void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
100 // Tell the analysis the |std::transform| function can't GC.
101 JS::AutoSuppressGCAnalysis nogc;
102
103 char (&fn)(CharT) = AsciiToUpperCase;
104 std::transform(chars, chars + length, dest, fn);
105 }
106
// Write the title case form of the |length| ASCII characters starting at
// |chars| to |dest|: the first character is upper-cased and all remaining
// characters are lower-cased. Does nothing for an empty input.
template <typename CharT>
void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
  if (length == 0) {
    return;
  }

  // Leading character.
  AsciiToUpperCase(chars, 1, dest);

  // Everything after the leading character.
  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
}
114
// Maximum lengths, in characters, of the fixed-size subtags and keys.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
static constexpr size_t LanguageLength{8};

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength{4};

// unicode_region_subtag = (alpha{2} | digit{3}) ;
static constexpr size_t AlphaRegionLength{2};
static constexpr size_t DigitRegionLength{3};

// A region subtag is either of the two forms above, so its maximum length is
// the longer of them.
static constexpr size_t RegionLength{AlphaRegionLength > DigitRegionLength
                                         ? AlphaRegionLength
                                         : DigitRegionLength};

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength{2};

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength{2};

}  // namespace LanguageTagLimits
136
137 // Fixed size language subtag which is stored inline in LanguageTag.
138 template <size_t Length>
139 class LanguageTagSubtag final {
140 uint8_t length_ = 0;
141 char chars_[Length] = {}; // zero initialize
142
143 public:
144 LanguageTagSubtag() = default;
145
146 LanguageTagSubtag(const LanguageTagSubtag&) = delete;
147 LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;
148
length()149 size_t length() const { return length_; }
missing()150 bool missing() const { return length_ == 0; }
present()151 bool present() const { return length_ > 0; }
152
span()153 mozilla::Span<const char> span() const { return {chars_, length_}; }
154
155 template <typename CharT>
set(mozilla::Span<const CharT> str)156 void set(mozilla::Span<const CharT> str) {
157 MOZ_ASSERT(str.size() <= Length);
158 std::copy_n(str.data(), str.size(), chars_);
159 length_ = str.size();
160 }
161
162 // The toXYZCase() methods are using |Length| instead of |length()|, because
163 // current compilers (tested GCC and Clang) can't infer the maximum string
164 // length - even when using hints like |std::min| - and instead are emitting
165 // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD
166 // code. (Emitting SIMD code doesn't make sense here, because the SIMD code
167 // only kicks in for long strings.) A fixed length will additionally ensure
168 // the compiler unrolls the loop in the case conversion code.
169
toLowerCase()170 void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); }
171
toUpperCase()172 void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); }
173
toTitleCase()174 void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); }
175
176 template <size_t N>
equalTo(const char (& str)[N])177 bool equalTo(const char (&str)[N]) const {
178 static_assert(N - 1 <= Length,
179 "subtag literals must not exceed the maximum subtag length");
180
181 return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0;
182 }
183 };
184
// Inline subtag storage types for the language, script and region subtags,
// each sized by the corresponding maximum length in |LanguageTagLimits|.
using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
188
189 /**
190 * Object representing a language tag.
191 *
192 * All subtags are already in canonicalized case.
193 */
194 class MOZ_STACK_CLASS LanguageTag final {
195 LanguageSubtag language_ = {};
196 ScriptSubtag script_ = {};
197 RegionSubtag region_ = {};
198
199 using VariantsVector = Vector<JS::UniqueChars, 2>;
200 using ExtensionsVector = Vector<JS::UniqueChars, 2>;
201
202 VariantsVector variants_;
203 ExtensionsVector extensions_;
204 JS::UniqueChars privateuse_ = nullptr;
205
206 friend class LanguageTagParser;
207
208 bool canonicalizeUnicodeExtension(JSContext* cx,
209 JS::UniqueChars& unicodeExtension);
210
211 bool canonicalizeTransformExtension(JSContext* cx,
212 JS::UniqueChars& transformExtension);
213
214 public:
215 static bool languageMapping(LanguageSubtag& language);
216 static bool complexLanguageMapping(const LanguageSubtag& language);
217
218 private:
219 static bool scriptMapping(ScriptSubtag& script);
220 static bool regionMapping(RegionSubtag& region);
221 static bool complexRegionMapping(const RegionSubtag& region);
222
223 void performComplexLanguageMappings();
224 void performComplexRegionMappings();
225 [[nodiscard]] bool performVariantMappings(JSContext* cx);
226
227 [[nodiscard]] bool updateLegacyMappings(JSContext* cx);
228
229 static bool signLanguageMapping(LanguageSubtag& language,
230 const RegionSubtag& region);
231
232 static const char* replaceTransformExtensionType(
233 mozilla::Span<const char> key, mozilla::Span<const char> type);
234
235 public:
236 /**
237 * Given a Unicode key and type, return the null-terminated preferred
238 * replacement for that type if there is one, or null if there is none, e.g.
239 * in effect
240 * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
241 * and
242 * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
243 */
244 static const char* replaceUnicodeExtensionType(
245 mozilla::Span<const char> key, mozilla::Span<const char> type);
246
247 public:
LanguageTag(JSContext * cx)248 explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}
249
250 LanguageTag(const LanguageTag&) = delete;
251 LanguageTag& operator=(const LanguageTag&) = delete;
252
language()253 const LanguageSubtag& language() const { return language_; }
script()254 const ScriptSubtag& script() const { return script_; }
region()255 const RegionSubtag& region() const { return region_; }
variants()256 const auto& variants() const { return variants_; }
extensions()257 const auto& extensions() const { return extensions_; }
privateuse()258 const char* privateuse() const { return privateuse_.get(); }
259
260 /**
261 * Return the Unicode extension subtag or nullptr if not present.
262 */
263 const char* unicodeExtension() const;
264
265 private:
266 ptrdiff_t unicodeExtensionIndex() const;
267
268 public:
269 /**
270 * Set the language subtag. The input must be a valid language subtag.
271 */
272 template <size_t N>
setLanguage(const char (& language)[N])273 void setLanguage(const char (&language)[N]) {
274 mozilla::Span<const char> span(language, N - 1);
275 MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
276 language_.set(span);
277 }
278
279 /**
280 * Set the language subtag. The input must be a valid language subtag.
281 */
setLanguage(const LanguageSubtag & language)282 void setLanguage(const LanguageSubtag& language) {
283 MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span()));
284 language_.set(language.span());
285 }
286
287 /**
288 * Set the script subtag. The input must be a valid script subtag.
289 */
290 template <size_t N>
setScript(const char (& script)[N])291 void setScript(const char (&script)[N]) {
292 mozilla::Span<const char> span(script, N - 1);
293 MOZ_ASSERT(IsStructurallyValidScriptTag(span));
294 script_.set(span);
295 }
296
297 /**
298 * Set the script subtag. The input must be a valid script subtag or the empty
299 * string.
300 */
setScript(const ScriptSubtag & script)301 void setScript(const ScriptSubtag& script) {
302 MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span()));
303 script_.set(script.span());
304 }
305
306 /**
307 * Set the region subtag. The input must be a valid region subtag.
308 */
309 template <size_t N>
setRegion(const char (& region)[N])310 void setRegion(const char (®ion)[N]) {
311 mozilla::Span<const char> span(region, N - 1);
312 MOZ_ASSERT(IsStructurallyValidRegionTag(span));
313 region_.set(span);
314 }
315
316 /**
317 * Set the region subtag. The input must be a valid region subtag or the empty
318 * empty string.
319 */
setRegion(const RegionSubtag & region)320 void setRegion(const RegionSubtag& region) {
321 MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span()));
322 region_.set(region.span());
323 }
324
325 /**
326 * Removes all variant subtags.
327 */
clearVariants()328 void clearVariants() { variants_.clearAndFree(); }
329
330 /**
331 * Set the Unicode extension subtag. The input must be a valid Unicode
332 * extension subtag.
333 */
334 bool setUnicodeExtension(JS::UniqueChars extension);
335
336 /**
337 * Remove any Unicode extension subtag if present.
338 */
339 void clearUnicodeExtension();
340
341 /**
342 * Set the private-use subtag. The input must be a valid private-use subtag
343 * or nullptr.
344 */
setPrivateuse(JS::UniqueChars privateuse)345 void setPrivateuse(JS::UniqueChars privateuse) {
346 MOZ_ASSERT(!privateuse ||
347 IsStructurallyValidPrivateUseTag(
348 {privateuse.get(), strlen(privateuse.get())}));
349 privateuse_ = std::move(privateuse);
350 }
351
352 /** Canonicalize the base-name (language, script, region, variant) subtags. */
353 bool canonicalizeBaseName(JSContext* cx);
354
355 /**
356 * Canonicalize all extension subtags.
357 */
358 bool canonicalizeExtensions(JSContext* cx);
359
360 /**
361 * Canonicalizes the given structurally valid Unicode BCP 47 locale
362 * identifier, including regularized case of subtags. For example, the
363 * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
364 * where
365 *
366 * Zh ; 2*3ALPHA
367 * -haNS ; ["-" script]
368 * -bu ; ["-" region]
369 * -variant2 ; *("-" variant)
370 * -Variant1
371 * -u-ca-chinese ; *("-" extension)
372 * -t-Zh-laTN
373 * -x-PRIVATE ; ["-" privateuse]
374 *
375 * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
376 *
377 * Spec: ECMAScript Internationalization API Specification, 6.2.3.
378 */
canonicalize(JSContext * cx)379 bool canonicalize(JSContext* cx) {
380 return canonicalizeBaseName(cx) && canonicalizeExtensions(cx);
381 }
382
383 /**
384 * Return the string representation of this language tag.
385 */
386 JSString* toString(JSContext* cx) const;
387
388 /**
389 * Return the string representation of this language tag as a null-terminated
390 * C-string.
391 */
392 JS::UniqueChars toStringZ(JSContext* cx) const;
393
394 /**
395 * Add likely-subtags to the language tag.
396 *
397 * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
398 */
399 bool addLikelySubtags(JSContext* cx);
400
401 /**
402 * Remove likely-subtags from the language tag.
403 *
404 * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
405 */
406 bool removeLikelySubtags(JSContext* cx);
407 };
408
409 /**
410 * Parser for Unicode BCP 47 locale identifiers.
411 *
412 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
413 */
414 class MOZ_STACK_CLASS LanguageTagParser final {
415 public:
416 // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
417 enum class TokenKind : uint8_t {
418 None = 0b000,
419 Alpha = 0b001,
420 Digit = 0b010,
421 AlphaDigit = 0b011,
422 Error = 0b100
423 };
424
425 private:
426 class Token final {
427 size_t index_;
428 size_t length_;
429 TokenKind kind_;
430
431 public:
Token(TokenKind kind,size_t index,size_t length)432 Token(TokenKind kind, size_t index, size_t length)
433 : index_(index), length_(length), kind_(kind) {}
434
kind()435 TokenKind kind() const { return kind_; }
index()436 size_t index() const { return index_; }
length()437 size_t length() const { return length_; }
438
isError()439 bool isError() const { return kind_ == TokenKind::Error; }
isNone()440 bool isNone() const { return kind_ == TokenKind::None; }
isAlpha()441 bool isAlpha() const { return kind_ == TokenKind::Alpha; }
isDigit()442 bool isDigit() const { return kind_ == TokenKind::Digit; }
isAlphaDigit()443 bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
444 };
445
446 using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;
447
448 const LocaleChars& locale_;
449 size_t length_;
450 size_t index_ = 0;
451
LanguageTagParser(const LocaleChars & locale,size_t length)452 LanguageTagParser(const LocaleChars& locale, size_t length)
453 : locale_(locale), length_(length) {}
454
charAtUnchecked(size_t index)455 char16_t charAtUnchecked(size_t index) const {
456 if (locale_.is<const JS::Latin1Char*>()) {
457 return locale_.as<const JS::Latin1Char*>()[index];
458 }
459 return locale_.as<const char16_t*>()[index];
460 }
461
charAt(size_t index)462 char charAt(size_t index) const {
463 char16_t c = charAtUnchecked(index);
464 MOZ_ASSERT(mozilla::IsAscii(c));
465 return c;
466 }
467
468 // Copy the token characters into |subtag|.
469 template <size_t N>
copyChars(const Token & tok,LanguageTagSubtag<N> & subtag)470 void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
471 size_t index = tok.index();
472 size_t length = tok.length();
473 if (locale_.is<const JS::Latin1Char*>()) {
474 using T = const JS::Latin1Char;
475 subtag.set(mozilla::Span(locale_.as<T*>() + index, length));
476 } else {
477 using T = const char16_t;
478 subtag.set(mozilla::Span(locale_.as<T*>() + index, length));
479 }
480 }
481
482 // Create a string copy of |length| characters starting at |index|.
483 JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;
484
485 // Create a string copy of the token characters.
chars(JSContext * cx,const Token & tok)486 JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
487 return chars(cx, tok.index(), tok.length());
488 }
489
extension(JSContext * cx,const Token & start,const Token & end)490 JS::UniqueChars extension(JSContext* cx, const Token& start,
491 const Token& end) const {
492 MOZ_ASSERT(start.index() < end.index());
493
494 size_t length = end.index() - 1 - start.index();
495 return chars(cx, start.index(), length);
496 }
497
498 Token nextToken();
499
500 // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
501 //
502 // Four character language subtags are not allowed in Unicode BCP 47 locale
503 // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
504 // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
isLanguage(const Token & tok)505 bool isLanguage(const Token& tok) const {
506 return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
507 (5 <= tok.length() && tok.length() <= 8));
508 }
509
510 // unicode_script_subtag = alpha{4} ;
isScript(const Token & tok)511 bool isScript(const Token& tok) const {
512 return tok.isAlpha() && tok.length() == 4;
513 }
514
515 // unicode_region_subtag = (alpha{2} | digit{3}) ;
isRegion(const Token & tok)516 bool isRegion(const Token& tok) const {
517 return (tok.isAlpha() && tok.length() == 2) ||
518 (tok.isDigit() && tok.length() == 3);
519 }
520
521 // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
isVariant(const Token & tok)522 bool isVariant(const Token& tok) const {
523 return (5 <= tok.length() && tok.length() <= 8) ||
524 (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
525 }
526
527 // Returns the code unit of the first character at the given singleton token.
528 // Always returns the lower case form of an alphabetical character.
singletonKey(const Token & tok)529 char singletonKey(const Token& tok) const {
530 MOZ_ASSERT(tok.length() == 1);
531 return AsciiToLowerCase(charAt(tok.index()));
532 }
533
534 // extensions = unicode_locale_extensions |
535 // transformed_extensions |
536 // other_extensions ;
537 //
538 // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
539 // (sep attribute)+ (sep keyword)*) ;
540 //
541 // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
542 // (sep tfield)+) ;
543 //
544 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isExtensionStart(const Token & tok)545 bool isExtensionStart(const Token& tok) const {
546 return tok.length() == 1 && singletonKey(tok) != 'x';
547 }
548
549 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
isOtherExtensionPart(const Token & tok)550 bool isOtherExtensionPart(const Token& tok) const {
551 return 2 <= tok.length() && tok.length() <= 8;
552 }
553
554 // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
555 // (sep attribute)+ (sep keyword)*) ;
556 // keyword = key (sep type)? ;
isUnicodeExtensionPart(const Token & tok)557 bool isUnicodeExtensionPart(const Token& tok) const {
558 return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
559 isUnicodeExtensionAttribute(tok);
560 }
561
562 // attribute = alphanum{3,8} ;
isUnicodeExtensionAttribute(const Token & tok)563 bool isUnicodeExtensionAttribute(const Token& tok) const {
564 return 3 <= tok.length() && tok.length() <= 8;
565 }
566
567 // key = alphanum alpha ;
isUnicodeExtensionKey(const Token & tok)568 bool isUnicodeExtensionKey(const Token& tok) const {
569 return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
570 }
571
572 // type = alphanum{3,8} (sep alphanum{3,8})* ;
isUnicodeExtensionType(const Token & tok)573 bool isUnicodeExtensionType(const Token& tok) const {
574 return 3 <= tok.length() && tok.length() <= 8;
575 }
576
577 // tkey = alpha digit ;
isTransformExtensionKey(const Token & tok)578 bool isTransformExtensionKey(const Token& tok) const {
579 return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
580 mozilla::IsAsciiDigit(charAt(tok.index() + 1));
581 }
582
583 // tvalue = (sep alphanum{3,8})+ ;
isTransformExtensionPart(const Token & tok)584 bool isTransformExtensionPart(const Token& tok) const {
585 return 3 <= tok.length() && tok.length() <= 8;
586 }
587
588 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUseStart(const Token & tok)589 bool isPrivateUseStart(const Token& tok) const {
590 return tok.length() == 1 && singletonKey(tok) == 'x';
591 }
592
593 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
isPrivateUsePart(const Token & tok)594 bool isPrivateUsePart(const Token& tok) const {
595 return 1 <= tok.length() && tok.length() <= 8;
596 }
597
598 // Helper function for use in |parseBaseName| and
599 // |parseTlangInTransformExtension|. Do not use this directly!
600 static JS::Result<bool> internalParseBaseName(JSContext* cx,
601 LanguageTagParser& ts,
602 LanguageTag& tag, Token& tok);
603
604 // Parse the `unicode_language_id` production, i.e. the
605 // language/script/region/variants portion of a language tag, into |tag|.
606 // |tok| must be the current token.
parseBaseName(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)607 static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
608 LanguageTag& tag, Token& tok) {
609 return internalParseBaseName(cx, ts, tag, tok);
610 }
611
612 // Parse the `tlang` production within a parsed 't' transform extension.
613 // The precise requirements for "previously parsed" are:
614 //
615 // * the input begins from current token |tok| with a valid `tlang`
616 // * the `tlang` is wholly lowercase (*not* canonical case)
617 // * variant subtags in the `tlang` may contain duplicates and be
618 // unordered
619 //
620 // Return an error on internal failure. Otherwise, return a success value. If
621 // there was no `tlang`, then |tag.language().missing()|. But if there was a
622 // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
623 // parse input.
parseTlangInTransformExtension(JSContext * cx,LanguageTagParser & ts,LanguageTag & tag,Token & tok)624 static JS::Result<JS::Ok> parseTlangInTransformExtension(
625 JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
626 MOZ_ASSERT(ts.isLanguage(tok));
627 return internalParseBaseName(cx, ts, tag, tok).map([](bool parsed) {
628 MOZ_ASSERT(parsed);
629 return JS::Ok();
630 });
631 }
632
633 friend class LanguageTag;
634
635 class Range final {
636 size_t begin_;
637 size_t length_;
638
639 public:
Range(size_t begin,size_t length)640 Range(size_t begin, size_t length) : begin_(begin), length_(length) {}
641
642 template <typename T>
begin(T * ptr)643 T* begin(T* ptr) const {
644 return ptr + begin_;
645 }
646
length()647 size_t length() const { return length_; }
648 };
649
650 using TFieldVector = js::Vector<Range, 8>;
651 using AttributesVector = js::Vector<Range, 8>;
652 using KeywordsVector = js::Vector<Range, 8>;
653
654 // Parse |extension|, which must be a validated, fully lowercase
655 // `transformed_extensions` subtag, and fill |tag| and |fields| from the
656 // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
657 // with |extension|.
658 static JS::Result<bool> parseTransformExtension(
659 JSContext* cx, mozilla::Span<const char> extension, LanguageTag& tag,
660 TFieldVector& fields);
661
662 // Parse |extension|, which must be a validated, fully lowercase
663 // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
664 // from the `attribute` and `keyword` components.
665 static JS::Result<bool> parseUnicodeExtension(
666 JSContext* cx, mozilla::Span<const char> extension,
667 AttributesVector& attributes, KeywordsVector& keywords);
668
669 static JS::Result<bool> tryParse(JSContext* cx, LocaleChars& localeChars,
670 size_t localeLength, LanguageTag& tag);
671
672 public:
673 // Parse the input string as a language tag. Reports an error to the context
674 // if the input can't be parsed completely.
675 static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
676
677 // Parse the input string as a language tag. Reports an error to the context
678 // if the input can't be parsed completely.
679 static bool parse(JSContext* cx, mozilla::Span<const char> locale,
680 LanguageTag& tag);
681
682 // Parse the input string as a language tag. Returns Ok(true) if the input
683 // could be completely parsed, Ok(false) if the input couldn't be parsed,
684 // or Err() in case of internal error.
685 static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
686 LanguageTag& tag);
687
688 // Parse the input string as a language tag. Returns Ok(true) if the input
689 // could be completely parsed, Ok(false) if the input couldn't be parsed,
690 // or Err() in case of internal error.
691 static JS::Result<bool> tryParse(JSContext* cx,
692 mozilla::Span<const char> locale,
693 LanguageTag& tag);
694
695 // Parse the input string as the base-name parts (language, script, region,
696 // variants) of a language tag. Ignores any trailing characters.
697 static bool parseBaseName(JSContext* cx, mozilla::Span<const char> locale,
698 LanguageTag& tag);
699
700 // Parse the input string as the base-name parts (language, script, region,
701 // variants) of a language tag. Returns Ok(true) if the input could be
702 // completely parsed, Ok(false) if the input couldn't be parsed, or Err() in
703 // case of internal error.
704 static JS::Result<bool> tryParseBaseName(JSContext* cx,
705 JSLinearString* locale,
706 LanguageTag& tag);
707
708 // Return true iff |extension| can be parsed as a Unicode extension subtag.
709 static bool canParseUnicodeExtension(mozilla::Span<const char> extension);
710
711 // Return true iff |unicodeType| can be parsed as a Unicode extension type.
712 static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
713 };
714
// Apply the bitwise-operators macro to the parser's token kind enum. The enum
// is declared |public| in LanguageTagParser so that it can be named here.
MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
716
/**
 * Parse a string as a standalone |language| tag. If |str| is a standalone
 * language tag, store it in |result| and return true. Otherwise return false.
 *
 * The return value must not be ignored.
 */
[[nodiscard]] bool ParseStandaloneLanguageTag(JS::Handle<JSLinearString*> str,
                                              LanguageSubtag& result);

/**
 * Parse a string as a standalone |script| tag. If |str| is a standalone script
 * tag, store it in |result| and return true. Otherwise return false.
 *
 * The return value must not be ignored.
 */
[[nodiscard]] bool ParseStandaloneScriptTag(JS::Handle<JSLinearString*> str,
                                            ScriptSubtag& result);

/**
 * Parse a string as a standalone |region| tag. If |str| is a standalone region
 * tag, store it in |result| and return true. Otherwise return false.
 *
 * The return value must not be ignored.
 */
[[nodiscard]] bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str,
                                            RegionSubtag& result);

/**
 * Parse a string as an ISO-639 language code. Return |nullptr| in the result if
 * the input could not be parsed or the canonical form of the resulting language
 * tag contains more than a single language subtag.
 *
 * NOTE(review): the error case of the returned JS::Result is presumably
 * reserved for internal failures - confirm against the definition.
 */
JS::Result<JSString*> ParseStandaloneISO639LanguageTag(
    JSContext* cx, JS::Handle<JSLinearString*> str);
745
746 class UnicodeExtensionKeyword final {
747 char key_[LanguageTagLimits::UnicodeKeyLength];
748 JSLinearString* type_;
749
750 public:
751 using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1];
752 using UnicodeKeySpan =
753 mozilla::Span<const char, LanguageTagLimits::UnicodeKeyLength>;
754
UnicodeExtensionKeyword(UnicodeKey key,JSLinearString * type)755 UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type)
756 : key_{key[0], key[1]}, type_(type) {}
757
key()758 UnicodeKeySpan key() const { return {key_, sizeof(key_)}; }
type()759 JSLinearString* type() const { return type_; }
760
761 void trace(JSTracer* trc);
762 };
763
// Declared here, defined elsewhere. The return value must not be ignored.
// NOTE(review): presumably applies the Unicode extension |keywords| to |tag|
// and returns false on failure - confirm against the definition.
[[nodiscard]] extern bool ApplyUnicodeExtensionToTag(
    JSContext* cx, LanguageTag& tag,
    JS::HandleVector<UnicodeExtensionKeyword> keywords);
767
768 } // namespace intl
769
770 } // namespace js
771
772 #endif /* builtin_intl_LanguageTag_h */
773