1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# This Source Code Form is subject to the terms of the Mozilla Public
5# License, v. 2.0. If a copy of the MPL was not distributed with this
6# file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8""" Usage:
9    make_intl_data.py langtags [cldr_core.zip]
10    make_intl_data.py tzdata
11    make_intl_data.py currency
12    make_intl_data.py units
13    make_intl_data.py numbering
14
15
16    Target "langtags":
17    This script extracts information about 1) mappings between deprecated and
18    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
19    BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
20    code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp.
21
22
23    Target "tzdata":
24    This script computes which time zone informations are not up-to-date in ICU
25    and provides the necessary mappings to workaround this problem.
26    https://ssl.icu-project.org/trac/ticket/12044
27
28
29    Target "currency":
30    Generates the mapping from currency codes to decimal digits used for them.
31
32
33    Target "units":
34    Generate source and test files using the list of so-called "sanctioned unit
35    identifiers" and verifies that the ICU data filter includes these units.
36
37
38    Target "numbering":
39    Generate source and test files using the list of numbering systems with
40    simple digit mappings and verifies that it's in sync with ICU/CLDR.
41"""
42
43from __future__ import print_function
44import os
45import re
46import io
47import json
48import shutil
49import sys
50import tarfile
51import tempfile
52import yaml
53from contextlib import closing
54from functools import partial, total_ordering
55from itertools import chain, groupby, tee
56from operator import attrgetter, itemgetter
57from zipfile import ZipFile
58
59if sys.version_info.major == 2:
60    from itertools import (
61        ifilter as filter,
62        ifilterfalse as filterfalse,
63        imap as map,
64        izip_longest as zip_longest,
65    )
66    from urllib2 import urlopen, Request as UrlRequest
67    from urlparse import urlsplit
68else:
69    from itertools import filterfalse, zip_longest
70    from urllib.request import urlopen, Request as UrlRequest
71    from urllib.parse import urlsplit
72
73
74# From https://docs.python.org/3/library/itertools.html
75def grouper(iterable, n, fillvalue=None):
76    "Collect data into fixed-length chunks or blocks"
77    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
78    args = [iter(iterable)] * n
79    return zip_longest(*args, fillvalue=fillvalue)
80
81
82def writeMappingHeader(println, description, source, url):
83    if type(description) is not list:
84        description = [description]
85    for desc in description:
86        println("// {0}".format(desc))
87    println("// Derived from {0}.".format(source))
88    println("// {0}".format(url))
89
90
91def writeMappingsVar(println, mapping, name, description, source, url):
92    """Writes a variable definition with a mapping table.
93
94    Writes the contents of dictionary |mapping| through the |println|
95    function with the given variable name and a comment with description,
96    fileDate, and URL.
97    """
98    println("")
99    writeMappingHeader(println, description, source, url)
100    println("var {0} = {{".format(name))
101    for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
102        println('    "{0}": "{1}",'.format(key, value))
103    println("};")
104
105
106def writeMappingsBinarySearch(
107    println,
108    fn_name,
109    type_name,
110    name,
111    validate_fn,
112    validate_case_fn,
113    mappings,
114    tag_maxlength,
115    description,
116    source,
117    url,
118):
119    """Emit code to perform a binary search on language tag subtags.
120
121    Uses the contents of |mapping|, which can either be a dictionary or set,
122    to emit a mapping function to find subtag replacements.
123    """
124    println("")
125    writeMappingHeader(println, description, source, url)
126    println(
127        """
128bool js::intl::LanguageTag::{0}({1} {2}) {{
129  MOZ_ASSERT({3}({2}.span()));
130  MOZ_ASSERT({4}({2}.span()));
131""".format(
132            fn_name, type_name, name, validate_fn, validate_case_fn
133        ).strip()
134    )
135    writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength)
136
137    println(
138        """
139}""".lstrip(
140            "\n"
141        )
142    )
143
144
145def writeMappingsBinarySearchBody(
146    println, source_name, target_name, mappings, tag_maxlength
147):
148    def write_array(subtags, name, length, fixed):
149        if fixed:
150            println(
151                "    static const char {}[{}][{}] = {{".format(
152                    name, len(subtags), length + 1
153                )
154            )
155        else:
156            println("    static const char* {}[{}] = {{".format(name, len(subtags)))
157
158        # Group in pairs of ten to not exceed the 80 line column limit.
159        for entries in grouper(subtags, 10):
160            entries = (
161                '"{}"'.format(tag).rjust(length + 2)
162                for tag in entries
163                if tag is not None
164            )
165            println("      {},".format(", ".join(entries)))
166
167        println("    };")
168
169    trailing_return = True
170
171    # Sort the subtags by length. That enables using an optimized comparator
172    # for the binary search, which only performs a single |memcmp| for multiple
173    # of two subtag lengths.
174    mappings_keys = mappings.keys() if type(mappings) == dict else mappings
175    for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
176        # Omit the length check if the current length is the maximum length.
177        if length != tag_maxlength:
178            println(
179                """
180  if ({}.length() == {}) {{
181""".format(
182                    source_name, length
183                ).rstrip(
184                    "\n"
185                )
186            )
187        else:
188            trailing_return = False
189            println(
190                """
191  {
192""".rstrip(
193                    "\n"
194                )
195            )
196
197        # The subtags need to be sorted for binary search to work.
198        subtags = sorted(subtags)
199
200        def equals(subtag):
201            return """{}.equalTo("{}")""".format(source_name, subtag)
202
203        # Don't emit a binary search for short lists.
204        if len(subtags) == 1:
205            if type(mappings) == dict:
206                println(
207                    """
208    if ({}) {{
209      {}.set(mozilla::MakeStringSpan("{}"));
210      return true;
211    }}
212    return false;
213""".format(
214                        equals(subtags[0]), target_name, mappings[subtags[0]]
215                    ).strip(
216                        "\n"
217                    )
218                )
219            else:
220                println(
221                    """
222    return {};
223""".format(
224                        equals(subtags[0])
225                    ).strip(
226                        "\n"
227                    )
228                )
229        elif len(subtags) <= 4:
230            if type(mappings) == dict:
231                for subtag in subtags:
232                    println(
233                        """
234    if ({}) {{
235      {}.set("{}");
236      return true;
237    }}
238""".format(
239                            equals(subtag), target_name, mappings[subtag]
240                        ).strip(
241                            "\n"
242                        )
243                    )
244
245                println(
246                    """
247    return false;
248""".strip(
249                        "\n"
250                    )
251                )
252            else:
253                cond = (equals(subtag) for subtag in subtags)
254                cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
255                println(
256                    """
257    return {};
258""".format(
259                        cond
260                    ).strip(
261                        "\n"
262                    )
263                )
264        else:
265            write_array(subtags, source_name + "s", length, True)
266
267            if type(mappings) == dict:
268                write_array([mappings[k] for k in subtags], "aliases", length, False)
269
270                println(
271                    """
272    if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
273      {1}.set(mozilla::MakeStringSpan(replacement));
274      return true;
275    }}
276    return false;
277""".format(
278                        source_name, target_name
279                    ).rstrip()
280                )
281            else:
282                println(
283                    """
284    return HasReplacement({0}s, {0});
285""".format(
286                        source_name
287                    ).rstrip()
288                )
289
290        println(
291            """
292  }
293""".strip(
294                "\n"
295            )
296        )
297
298    if trailing_return:
299        println(
300            """
301  return false;"""
302        )
303
304
305def writeComplexLanguageTagMappings(
306    println, complex_language_mappings, description, source, url
307):
308    println("")
309    writeMappingHeader(println, description, source, url)
310    println(
311        """
312void js::intl::LanguageTag::performComplexLanguageMappings() {
313  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
314  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
315""".lstrip()
316    )
317
318    # Merge duplicate language entries.
319    language_aliases = {}
320    for (deprecated_language, (language, script, region)) in sorted(
321        complex_language_mappings.items(), key=itemgetter(0)
322    ):
323        key = (language, script, region)
324        if key not in language_aliases:
325            language_aliases[key] = []
326        else:
327            language_aliases[key].append(deprecated_language)
328
329    first_language = True
330    for (deprecated_language, (language, script, region)) in sorted(
331        complex_language_mappings.items(), key=itemgetter(0)
332    ):
333        key = (language, script, region)
334        if deprecated_language in language_aliases[key]:
335            continue
336
337        if_kind = "if" if first_language else "else if"
338        first_language = False
339
340        cond = (
341            'language().equalTo("{}")'.format(lang)
342            for lang in [deprecated_language] + language_aliases[key]
343        )
344        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
345
346        println(
347            """
348  {} ({}) {{""".format(
349                if_kind, cond
350            ).strip(
351                "\n"
352            )
353        )
354
355        println(
356            """
357    setLanguage("{}");""".format(
358                language
359            ).strip(
360                "\n"
361            )
362        )
363
364        if script is not None:
365            println(
366                """
367    if (script().missing()) {{
368      setScript("{}");
369    }}""".format(
370                    script
371                ).strip(
372                    "\n"
373                )
374            )
375        if region is not None:
376            println(
377                """
378    if (region().missing()) {{
379      setRegion("{}");
380    }}""".format(
381                    region
382                ).strip(
383                    "\n"
384                )
385            )
386        println(
387            """
388  }""".strip(
389                "\n"
390            )
391        )
392
393    println(
394        """
395}
396""".strip(
397            "\n"
398        )
399    )
400
401
402def writeComplexRegionTagMappings(
403    println, complex_region_mappings, description, source, url
404):
405    println("")
406    writeMappingHeader(println, description, source, url)
407    println(
408        """
409void js::intl::LanguageTag::performComplexRegionMappings() {
410  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
411  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
412  MOZ_ASSERT(IsStructurallyValidRegionTag(region().span()));
413  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span()));
414""".lstrip()
415    )
416
417    # |non_default_replacements| is a list and hence not hashable. Convert it
418    # to a string to get a proper hashable value.
419    def hash_key(default, non_default_replacements):
420        return (default, str(sorted(str(v) for v in non_default_replacements)))
421
422    # Merge duplicate region entries.
423    region_aliases = {}
424    for (deprecated_region, (default, non_default_replacements)) in sorted(
425        complex_region_mappings.items(), key=itemgetter(0)
426    ):
427        key = hash_key(default, non_default_replacements)
428        if key not in region_aliases:
429            region_aliases[key] = []
430        else:
431            region_aliases[key].append(deprecated_region)
432
433    first_region = True
434    for (deprecated_region, (default, non_default_replacements)) in sorted(
435        complex_region_mappings.items(), key=itemgetter(0)
436    ):
437        key = hash_key(default, non_default_replacements)
438        if deprecated_region in region_aliases[key]:
439            continue
440
441        if_kind = "if" if first_region else "else if"
442        first_region = False
443
444        cond = (
445            'region().equalTo("{}")'.format(region)
446            for region in [deprecated_region] + region_aliases[key]
447        )
448        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
449
450        println(
451            """
452  {} ({}) {{""".format(
453                if_kind, cond
454            ).strip(
455                "\n"
456            )
457        )
458
459        replacement_regions = sorted(
460            {region for (_, _, region) in non_default_replacements}
461        )
462
463        first_case = True
464        for replacement_region in replacement_regions:
465            replacement_language_script = sorted(
466                (language, script)
467                for (language, script, region) in (non_default_replacements)
468                if region == replacement_region
469            )
470
471            if_kind = "if" if first_case else "else if"
472            first_case = False
473
474            def compare_tags(language, script):
475                if script is None:
476                    return 'language().equalTo("{}")'.format(language)
477                return '(language().equalTo("{}") && script().equalTo("{}"))'.format(
478                    language, script
479                )
480
481            cond = (
482                compare_tags(language, script)
483                for (language, script) in replacement_language_script
484            )
485            cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)
486
487            println(
488                """
489    {} ({}) {{
490      setRegion("{}");
491    }}""".format(
492                    if_kind, cond, replacement_region
493                )
494                .rstrip()
495                .strip("\n")
496            )
497
498        println(
499            """
500    else {{
501      setRegion("{}");
502    }}
503  }}""".format(
504                default
505            )
506            .rstrip()
507            .strip("\n")
508        )
509
510    println(
511        """
512}
513""".strip(
514            "\n"
515        )
516    )
517
518
519def writeVariantTagMappings(println, variant_mappings, description, source, url):
520    """ Writes a function definition that maps variant subtags. """
521    println(
522        """
523static const char* ToCharPointer(const char* str) {
524  return str;
525}
526
527static const char* ToCharPointer(const js::UniqueChars& str) {
528  return str.get();
529}
530
531template <typename T, typename U = T>
532static bool IsLessThan(const T& a, const U& b) {
533  return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
534}
535"""
536    )
537    writeMappingHeader(println, description, source, url)
538    println(
539        """
540bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) {
541  // The variant subtags need to be sorted for binary search.
542  MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
543                            IsLessThan<decltype(variants_)::ElementType>));
544
545  auto removeVariantAt = [&](size_t index) {
546    variants_.erase(variants_.begin() + index);
547  };
548
549  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
550    auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
551                               IsLessThan<decltype(variants_)::ElementType,
552                                          decltype(variant)>);
553
554    // Don't insert the replacement when already present.
555    if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
556      return true;
557    }
558
559    // Insert the preferred variant in sort order.
560    auto preferred = DuplicateString(cx, variant);
561    if (!preferred) {
562      return false;
563    }
564    return !!variants_.insert(p, std::move(preferred));
565  };
566
567  for (size_t i = 0; i < variants_.length(); ) {
568    const char* variant = variants_[i].get();
569    MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant)));
570""".lstrip()
571    )
572
573    (no_alias, with_alias) = partition(
574        variant_mappings.items(), lambda item: item[1] is None
575    )
576
577    no_replacements = " ||\n        ".join(
578        f"""strcmp(variant, "{deprecated_variant}") == 0"""
579        for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0))
580    )
581
582    println(
583        f"""
584    if ({no_replacements}) {{
585      removeVariantAt(i);
586    }}
587""".strip(
588            "\n"
589        )
590    )
591
592    for (deprecated_variant, (type, replacement)) in sorted(
593        with_alias, key=itemgetter(0)
594    ):
595        println(
596            f"""
597    else if (strcmp(variant, "{deprecated_variant}") == 0) {{
598      removeVariantAt(i);
599""".strip(
600                "\n"
601            )
602        )
603
604        if type == "language":
605            println(
606                f"""
607      setLanguage("{replacement}");
608""".strip(
609                    "\n"
610                )
611            )
612        elif type == "region":
613            println(
614                f"""
615      setRegion("{replacement}");
616""".strip(
617                    "\n"
618                )
619            )
620        else:
621            assert type == "variant"
622            println(
623                f"""
624      if (!insertVariantSortedIfNotPresent("{replacement}")) {{
625        return false;
626      }}
627""".strip(
628                    "\n"
629                )
630            )
631
632        println(
633            """
634    }
635""".strip(
636                "\n"
637            )
638        )
639
640    println(
641        """
642    else {
643      i++;
644    }
645  }
646  return true;
647}
648""".strip(
649            "\n"
650        )
651    )
652
653
654def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url):
655    """ Writes a function definition that maps legacy language tags. """
656    println("")
657    writeMappingHeader(println, description, source, url)
658    println(
659        """\
660bool js::intl::LanguageTag::updateLegacyMappings(JSContext* cx) {
661  // We're mapping legacy tags to non-legacy form here.
662  // Other tags remain unchanged.
663  //
664  // Legacy tags are either sign language tags ("sgn") or have one or multiple
665  // variant subtags. Therefore we can quickly exclude most tags by checking
666  // these two subtags.
667
668  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
669
670  if (!language().equalTo("sgn") && variants().length() == 0) {
671    return true;
672  }
673
674  for ([[maybe_unused]] const auto& variant : variants()) {
675    MOZ_ASSERT(IsStructurallyValidVariantTag(mozilla::MakeStringSpan(variant.get())));
676    MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get())));
677  }
678
679  // The variant subtags need to be sorted for binary search.
680  MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
681                            IsLessThan<decltype(variants_)::ElementType>));
682
683  auto findVariant = [this](const char* variant) {
684    auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
685                               IsLessThan<decltype(variants_)::ElementType,
686                                          decltype(variant)>);
687
688    if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
689      return p;
690    }
691    return static_cast<decltype(p)>(nullptr);
692  };
693
694  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
695    auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
696                               IsLessThan<decltype(variants_)::ElementType,
697                                          decltype(variant)>);
698
699    // Don't insert the replacement when already present.
700    if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
701      return true;
702    }
703
704    // Insert the preferred variant in sort order.
705    auto preferred = DuplicateString(cx, variant);
706    if (!preferred) {
707      return false;
708    }
709    return !!variants_.insert(p, std::move(preferred));
710  };
711
712  auto removeVariant = [&](auto* p) {
713    size_t index = std::distance(variants_.begin(), p);
714    variants_.erase(variants_.begin() + index);
715  };
716
717  auto removeVariants = [&](auto* p, auto* q) {
718    size_t pIndex = std::distance(variants_.begin(), p);
719    size_t qIndex = std::distance(variants_.begin(), q);
720    MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted");
721
722    variants_.erase(variants_.begin() + qIndex);
723    variants_.erase(variants_.begin() + pIndex);
724  };"""
725    )
726
727    # Helper class for pattern matching.
728    class AnyClass:
729        def __eq__(self, obj):
730            return obj is not None
731
732    Any = AnyClass()
733
734    # Group the mappings by language.
735    legacy_mappings_by_language = {}
736    for (type, replacement) in legacy_mappings.items():
737        (language, _, _, _) = type
738        legacy_mappings_by_language.setdefault(language, {})[type] = replacement
739
740    # Handle the empty language case first.
741    if None in legacy_mappings_by_language:
742        # Get the mappings and remove them from the dict.
743        mappings = legacy_mappings_by_language.pop(None)
744
745        # This case only applies for the "hepburn-heploc" -> "alalc97"
746        # mapping, so just inline it here.
747        from_tag = (None, None, None, "hepburn-heploc")
748        to_tag = (None, None, None, "alalc97")
749
750        assert len(mappings) == 1
751        assert mappings[from_tag] == to_tag
752
753        println(
754            """
755  if (variants().length() >= 2) {
756    if (auto* hepburn = findVariant("hepburn")) {
757      if (auto* heploc = findVariant("heploc")) {
758        removeVariants(hepburn, heploc);
759
760        if (!insertVariantSortedIfNotPresent("alalc97")) {
761          return false;
762        }
763      }
764    }
765  }
766"""
767        )
768
769    # Handle sign languages next.
770    if "sgn" in legacy_mappings_by_language:
771        mappings = legacy_mappings_by_language.pop("sgn")
772
773        # Legacy sign language mappings have the form "sgn-XX" where "XX" is
774        # some region code.
775        assert all(type == ("sgn", None, Any, None) for type in mappings.keys())
776
777        # Legacy sign languages are mapped to a single language subtag.
778        assert all(
779            replacement == (Any, None, None, None) for replacement in mappings.values()
780        )
781
782        println(
783            """
784  if (language().equalTo("sgn")) {
785    if (region().present() && signLanguageMapping(language_, region())) {
786      region_.set(mozilla::MakeStringSpan(""));
787    }
788  }
789""".rstrip().lstrip(
790                "\n"
791            )
792        )
793
794    # Finally handle all remaining cases.
795
796    # The remaining mappings have neither script nor region subtags in the source locale.
797    assert all(
798        type == (Any, None, None, Any)
799        for mappings in legacy_mappings_by_language.values()
800        for type in mappings.keys()
801    )
802
803    # And they have neither script nor region nor variant subtags in the target locale.
804    assert all(
805        replacement == (Any, None, None, None)
806        for mappings in legacy_mappings_by_language.values()
807        for replacement in mappings.values()
808    )
809
810    # Compact the mappings table by removing empty fields.
811    legacy_mappings_by_language = {
812        lang: {
813            variants: r_language
814            for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items()
815        }
816        for (lang, mappings) in legacy_mappings_by_language.items()
817    }
818
819    # Try to combine the remaining cases.
820    legacy_mappings_compact = {}
821
822    # Python can't hash dicts or lists, so use the string representation as the hash key.
823    def hash_key(mappings):
824        return str(sorted(mappings.items(), key=itemgetter(0)))
825
826    for (lang, mappings) in sorted(
827        legacy_mappings_by_language.items(), key=itemgetter(0)
828    ):
829        key = hash_key(mappings)
830        legacy_mappings_compact.setdefault(key, []).append(lang)
831
832    for langs in legacy_mappings_compact.values():
833        language_equal_to = (
834            f"""language().equalTo("{lang}")""" for lang in sorted(langs)
835        )
836        cond = f""" ||\n{" " * len("  else if (")}""".join(language_equal_to)
837
838        println(
839            f"""
840  else if ({cond}) {{
841""".rstrip().lstrip(
842                "\n"
843            )
844        )
845
846        mappings = legacy_mappings_by_language[langs[0]]
847
848        # Count the variant subtags to determine the sort order.
849        def variant_size(m):
850            (k, _) = m
851            return len(k.split("-"))
852
853        # Alias rules are applied by largest union size first.
854        for (size, mappings_by_size) in groupby(
855            sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size
856        ):
857
858            # Convert grouper object to dict.
859            mappings_by_size = dict(mappings_by_size)
860
861            is_first = True
862            chain_if = size == 1
863
864            # Alias rules are applied in alphabetical order
865            for (variants, r_language) in sorted(
866                mappings_by_size.items(), key=itemgetter(0)
867            ):
868                sorted_variants = sorted(variants.split("-"))
869                len_variants = len(sorted_variants)
870
871                maybe_else = "else " if chain_if and not is_first else ""
872                is_first = False
873
874                for (i, variant) in enumerate(sorted_variants):
875                    println(
876                        f"""
877    {"  " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{
878""".rstrip().lstrip(
879                            "\n"
880                        )
881                    )
882
883                indent = "  " * len_variants
884
885                println(
886                    f"""
887    {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)});
888    {indent}setLanguage("{r_language}");
889    {indent}{"return true;" if not chain_if else ""}
890""".rstrip().lstrip(
891                        "\n"
892                    )
893                )
894
895                for i in range(len_variants, 0, -1):
896                    println(
897                        f"""
898    {"  " * (i - 1)}}}
899""".rstrip().lstrip(
900                            "\n"
901                        )
902                    )
903
904        println(
905            """
906  }
907""".rstrip().lstrip(
908                "\n"
909            )
910        )
911
912    println(
913        """
914  return true;
915}"""
916    )
917
918
919def writeSignLanguageMappingsFunction(
920    println, legacy_mappings, description, source, url
921):
922    """ Writes a function definition that maps legacy sign language tags. """
923    println("")
924    writeMappingHeader(println, description, source, url)
925    println(
926        """\
927bool js::intl::LanguageTag::signLanguageMapping(LanguageSubtag& language,
928                                                const RegionSubtag& region) {
929  MOZ_ASSERT(language.equalTo("sgn"));
930  MOZ_ASSERT(IsStructurallyValidRegionTag(region.span()));
931  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.span()));
932""".rstrip()
933    )
934
935    region_mappings = {
936        rg: lg
937        for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items()
938        if lang == "sgn"
939    }
940
941    source_name = "region"
942    target_name = "language"
943    tag_maxlength = 3
944    writeMappingsBinarySearchBody(
945        println, source_name, target_name, region_mappings, tag_maxlength
946    )
947
948    println(
949        """
950}""".lstrip()
951    )
952
953
954def readSupplementalData(core_file):
955    """Reads CLDR Supplemental Data and extracts information for Intl.js.
956
957    Information extracted:
958    - legacyMappings: mappings from legacy tags to preferred complete language tags
959    - languageMappings: mappings from language subtags to preferred subtags
960    - complexLanguageMappings: mappings from language subtags with complex rules
961    - regionMappings: mappings from region subtags to preferred subtags
962    - complexRegionMappings: mappings from region subtags with complex rules
963    - variantMappings: mappings from variant subtags to preferred subtags
964    - likelySubtags: likely subtags used for generating test data only
965    Returns these mappings as dictionaries.
966    """
967    import xml.etree.ElementTree as ET
968
969    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
970    re_unicode_language_id = re.compile(
971        r"""
972        ^
973        # unicode_language_id = unicode_language_subtag
974        #     unicode_language_subtag = alpha{2,3} | alpha{5,8}
975        (?P<language>[a-z]{2,3}|[a-z]{5,8})
976
977        # (sep unicode_script_subtag)?
978        #     unicode_script_subtag = alpha{4}
979        (?:-(?P<script>[a-z]{4}))?
980
981        # (sep unicode_region_subtag)?
982        #     unicode_region_subtag = (alpha{2} | digit{3})
983        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
984
985        # (sep unicode_variant_subtag)*
986        #     unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
987        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
988        $
989        """,
990        re.IGNORECASE | re.VERBOSE,
991    )
992
993    # CLDR uses "_" as the separator for some elements. Replace it with "-".
994    def bcp47_id(cldr_id):
995        return cldr_id.replace("_", "-")
996
997    # Return the tuple (language, script, region, variants) and assert all
998    # subtags are in canonical case.
999    def bcp47_canonical(language, script, region, variants):
1000        # Canonical case for language subtags is lower case.
1001        assert language is None or language.lower() == language
1002
1003        # Canonical case for script subtags is title case.
1004        assert script is None or script.title() == script
1005
1006        # Canonical case for region subtags is upper case.
1007        assert region is None or region.upper() == region
1008
1009        # Canonical case for variant subtags is lower case.
1010        assert variants is None or variants.lower() == variants
1011
1012        return (language, script, region, variants[1:] if variants else None)
1013
1014    # Language ids are interpreted as multi-maps in
1015    # <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>.
1016    #
1017    # See UTS35, §Annex C, Definitions - 1. Multimap interpretation.
1018    def language_id_to_multimap(language_id):
1019        match = re_unicode_language_id.match(language_id)
1020        assert (
1021            match is not None
1022        ), f"{language_id} invalid Unicode BCP 47 locale identifier"
1023
1024        canonical_language_id = bcp47_canonical(
1025            *match.group("language", "script", "region", "variants")
1026        )
1027        (language, _, _, _) = canonical_language_id
1028
1029        # Normalize "und" language to None, but keep the rest as is.
1030        return (language if language != "und" else None,) + canonical_language_id[1:]
1031
1032    rules = {}
1033    territory_exception_rules = {}
1034
1035    tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
1036
1037    # Load the rules from supplementalMetadata.xml.
1038    #
1039    # See UTS35, §Annex C, Definitions - 2. Alias elements.
1040    # See UTS35, §Annex C, Preprocessing.
1041    for alias_name in [
1042        "languageAlias",
1043        "scriptAlias",
1044        "territoryAlias",
1045        "variantAlias",
1046    ]:
1047        for alias in tree.iterfind(".//" + alias_name):
1048            # Replace '_' by '-'.
1049            type = bcp47_id(alias.get("type"))
1050            replacement = bcp47_id(alias.get("replacement"))
1051
1052            # Prefix with "und-".
1053            if alias_name != "languageAlias":
1054                type = "und-" + type
1055
1056            # Discard all rules where the type is an invalid languageId.
1057            if re_unicode_language_id.match(type) is None:
1058                continue
1059
1060            type = language_id_to_multimap(type)
1061
1062            # Multiple, whitespace-separated territory replacements may be present.
1063            if alias_name == "territoryAlias" and " " in replacement:
1064                replacements = replacement.split(" ")
1065                replacement_list = [
1066                    language_id_to_multimap("und-" + r) for r in replacements
1067                ]
1068
1069                assert (
1070                    type not in territory_exception_rules
1071                ), f"Duplicate alias rule: {type}"
1072
1073                territory_exception_rules[type] = replacement_list
1074
1075                # The first element is the default territory replacement.
1076                replacement = replacements[0]
1077
1078            # Prefix with "und-".
1079            if alias_name != "languageAlias":
1080                replacement = "und-" + replacement
1081
1082            replacement = language_id_to_multimap(replacement)
1083
1084            assert type not in rules, f"Duplicate alias rule: {type}"
1085
1086            rules[type] = replacement
1087
1088    # Helper class for pattern matching.
1089    class AnyClass:
1090        def __eq__(self, obj):
1091            return obj is not None
1092
1093    Any = AnyClass()
1094
1095    modified_rules = True
1096    loop_count = 0
1097
1098    while modified_rules:
1099        modified_rules = False
1100        loop_count += 1
1101
1102        # UTS 35 defines that canonicalization is applied until a fixed point has
1103        # been reached. This iterative application of the canonicalization algorithm
1104        # is only needed for a relatively small set of rules, so we can precompute
1105        # the transitive closure of all rules here and then perform a single pass
1106        # when canonicalizing language tags at runtime.
1107        transitive_rules = {}
1108
1109        # Compute the transitive closure.
1110        # Any case which currently doesn't occur in the CLDR sources isn't supported
1111        # and will lead to throwing an error.
1112        for (type, replacement) in rules.items():
1113            (language, script, region, variants) = type
1114            (r_language, r_script, r_region, r_variants) = replacement
1115
1116            for (i_type, i_replacement) in rules.items():
1117                (i_language, i_script, i_region, i_variants) = i_type
1118                (i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement
1119
1120                if i_language is not None and i_language == r_language:
1121                    # This case currently only occurs when neither script nor region
1122                    # subtags are present. A single variant subtags may be present
1123                    # in |type|. And |i_type| definitely has a single variant subtag.
1124                    # Should this ever change, update this code accordingly.
1125                    assert type == (Any, None, None, None) or type == (
1126                        Any,
1127                        None,
1128                        None,
1129                        Any,
1130                    )
1131                    assert replacement == (Any, None, None, None)
1132                    assert i_type == (Any, None, None, Any)
1133                    assert i_replacement == (Any, None, None, None)
1134
1135                    # This case happens for the rules
1136                    #   "zh-guoyu -> zh",
1137                    #   "zh-hakka -> hak", and
1138                    #   "und-hakka -> und".
1139                    # Given the possible input "zh-guoyu-hakka", the first rule will
1140                    # change it to "zh-hakka", and then the second rule can be
1141                    # applied. (The third rule isn't applied ever.)
1142                    #
1143                    # Let's assume there's a hypothetical rule
1144                    #   "zh-aaaaa" -> "en"
1145                    # And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en"
1146                    # is applied before "zh-hakka -> hak", because rules are sorted
1147                    # alphabetically. That means the overall result is "en":
1148                    # "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then
1149                    # "hakka" is removed through the third rule.
1150                    #
1151                    # No current rule requires to handle this special case, so we
1152                    # don't yet support it.
1153                    assert variants is None or variants <= i_variants
1154
1155                    # Combine all variants and remove duplicates.
1156                    vars = set(
1157                        i_variants.split("-")
1158                        + (variants.split("-") if variants else [])
1159                    )
1160
1161                    # Add the variants alphabetically sorted.
1162                    n_type = (language, None, None, "-".join(sorted(vars)))
1163
1164                    assert (
1165                        n_type not in transitive_rules
1166                        or transitive_rules[n_type] == i_replacement
1167                    )
1168                    transitive_rules[n_type] = i_replacement
1169
1170                    continue
1171
1172                if i_script is not None and i_script == r_script:
1173                    # This case currently doesn't occur, so we don't yet support it.
1174                    raise ValueError(
1175                        f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
1176                    )
1177                if i_region is not None and i_region == r_region:
1178                    # This case currently only applies for sign language
1179                    # replacements. Similar to the language subtag case any other
1180                    # combination isn't currently supported.
1181                    assert type == (None, None, Any, None)
1182                    assert replacement == (None, None, Any, None)
1183                    assert i_type == ("sgn", None, Any, None)
1184                    assert i_replacement == (Any, None, None, None)
1185
1186                    n_type = ("sgn", None, region, None)
1187
1188                    assert n_type not in transitive_rules
1189                    transitive_rules[n_type] = i_replacement
1190
1191                    continue
1192
1193                if i_variants is not None and i_variants == r_variants:
1194                    # This case currently doesn't occur, so we don't yet support it.
1195                    raise ValueError(
1196                        f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
1197                    )
1198
1199        # Ensure there are no contradicting rules.
1200        assert all(
1201            rules[type] == replacement
1202            for (type, replacement) in transitive_rules.items()
1203            if type in rules
1204        )
1205
1206        # If |transitive_rules| is not a subset of |rules|, new rules will be added.
1207        modified_rules = not (transitive_rules.keys() <= rules.keys())
1208
1209        # Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}"
1210        # case. Failing this assertion means either there's a bug when computing the
1211        # stop condition of this loop or a new kind of legacy language tags was added.
1212        if modified_rules and loop_count > 1:
1213            new_rules = {k for k in transitive_rules.keys() if k not in rules}
1214            for k in new_rules:
1215                assert k == (Any, None, None, "guoyu-hakka") or k == (
1216                    Any,
1217                    None,
1218                    None,
1219                    "guoyu-xiang",
1220                )
1221
1222        # Merge the transitive rules.
1223        rules.update(transitive_rules)
1224
1225    # Computes the size of the union of all field value sets.
1226    def multi_map_size(locale_id):
1227        (language, script, region, variants) = locale_id
1228
1229        return (
1230            (1 if language is not None else 0)
1231            + (1 if script is not None else 0)
1232            + (1 if region is not None else 0)
1233            + (len(variants.split("-")) if variants is not None else 0)
1234        )
1235
1236    # Dictionary of legacy mappings, contains raw rules, e.g.
1237    # (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97").
1238    legacy_mappings = {}
1239
1240    # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
1241    language_mappings = {}
1242
1243    # Dictionary of complex language subtag mappings, modifying more than one
1244    # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
1245    complex_language_mappings = {}
1246
1247    # Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh".
1248    script_mappings = {}
1249
1250    # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
1251    region_mappings = {}
1252
1253    # Dictionary of complex region subtag mappings, containing more than one
1254    # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
1255    complex_region_mappings = {}
1256
1257    # Dictionary of aliased variant subtags to a tuple of preferred replacement
1258    # type and replacement, e.g. "arevela" -> ("language", "hy") or
1259    # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
1260    variant_mappings = {}
1261
1262    # Preprocess all rules so we can perform a single lookup per subtag at runtime.
1263    for (type, replacement) in rules.items():
1264        (language, script, region, variants) = type
1265        (r_language, r_script, r_region, r_variants) = replacement
1266
1267        type_map_size = multi_map_size(type)
1268
1269        # Most mappings are one-to-one and can be encoded through lookup tables.
1270        if type_map_size == 1:
1271            if language is not None:
1272                assert r_language is not None, "Can't remove a language subtag"
1273
1274                # We don't yet support this case.
1275                assert (
1276                    r_variants is None
1277                ), f"Unhandled variant replacement in language alias: {replacement}"
1278
1279                if replacement == (Any, None, None, None):
1280                    language_mappings[language] = r_language
1281                else:
1282                    complex_language_mappings[language] = replacement[:-1]
1283            elif script is not None:
1284                # We don't support removing script subtags.
1285                assert (
1286                    r_script is not None
1287                ), f"Can't remove a script subtag: {replacement}"
1288
1289                # We only support one-to-one script mappings for now.
1290                assert replacement == (
1291                    None,
1292                    Any,
1293                    None,
1294                    None,
1295                ), f"Unhandled replacement in script alias: {replacement}"
1296
1297                script_mappings[script] = r_script
1298            elif region is not None:
1299                # We don't support removing region subtags.
1300                assert (
1301                    r_region is not None
1302                ), f"Can't remove a region subtag: {replacement}"
1303
1304                # We only support one-to-one region mappings for now.
1305                assert replacement == (
1306                    None,
1307                    None,
1308                    Any,
1309                    None,
1310                ), f"Unhandled replacement in region alias: {replacement}"
1311
1312                if type not in territory_exception_rules:
1313                    region_mappings[region] = r_region
1314                else:
1315                    complex_region_mappings[region] = [
1316                        r_region
1317                        for (_, _, r_region, _) in territory_exception_rules[type]
1318                    ]
1319            else:
1320                assert variants is not None
1321                assert len(variants.split("-")) == 1
1322
1323                # We only support one-to-one variant mappings for now.
1324                assert (
1325                    multi_map_size(replacement) <= 1
1326                ), f"Unhandled replacement in variant alias: {replacement}"
1327
1328                if r_language is not None:
1329                    variant_mappings[variants] = ("language", r_language)
1330                elif r_script is not None:
1331                    variant_mappings[variants] = ("script", r_script)
1332                elif r_region is not None:
1333                    variant_mappings[variants] = ("region", r_region)
1334                elif r_variants is not None:
1335                    assert len(r_variants.split("-")) == 1
1336                    variant_mappings[variants] = ("variant", r_variants)
1337                else:
1338                    variant_mappings[variants] = None
1339        else:
1340            # Alias rules which have multiple input fields must be processed
1341            # first. This applies only to a handful of rules, so our generated
1342            # code adds fast paths to skip these rules in the common case.
1343
1344            # Case 1: Language and at least one variant subtag.
1345            if language is not None and variants is not None:
1346                pass
1347
1348            # Case 2: Sign language and a region subtag.
1349            elif language == "sgn" and region is not None:
1350                pass
1351
1352            # Case 3: "hepburn-heploc" to "alalc97" canonicalization.
1353            elif (
1354                language is None
1355                and variants is not None
1356                and len(variants.split("-")) == 2
1357            ):
1358                pass
1359
1360            # Any other combination is currently unsupported.
1361            else:
1362                raise ValueError(f"{type} -> {replacement}")
1363
1364            legacy_mappings[type] = replacement
1365
1366    tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
1367
1368    likely_subtags = {}
1369
1370    for likely_subtag in tree.iterfind(".//likelySubtag"):
1371        from_tag = bcp47_id(likely_subtag.get("from"))
1372        from_match = re_unicode_language_id.match(from_tag)
1373        assert (
1374            from_match is not None
1375        ), f"{from_tag} invalid Unicode BCP 47 locale identifier"
1376        assert (
1377            from_match.group("variants") is None
1378        ), f"unexpected variant subtags in {from_tag}"
1379
1380        to_tag = bcp47_id(likely_subtag.get("to"))
1381        to_match = re_unicode_language_id.match(to_tag)
1382        assert (
1383            to_match is not None
1384        ), f"{to_tag} invalid Unicode BCP 47 locale identifier"
1385        assert (
1386            to_match.group("variants") is None
1387        ), f"unexpected variant subtags in {to_tag}"
1388
1389        from_canonical = bcp47_canonical(
1390            *from_match.group("language", "script", "region", "variants")
1391        )
1392
1393        to_canonical = bcp47_canonical(
1394            *to_match.group("language", "script", "region", "variants")
1395        )
1396
1397        # Remove the empty variant subtags.
1398        from_canonical = from_canonical[:-1]
1399        to_canonical = to_canonical[:-1]
1400
1401        likely_subtags[from_canonical] = to_canonical
1402
1403    complex_region_mappings_final = {}
1404
1405    for (deprecated_region, replacements) in complex_region_mappings.items():
1406        # Find all likely subtag entries which don't already contain a region
1407        # subtag and whose target region is in the list of replacement regions.
1408        region_likely_subtags = [
1409            (from_language, from_script, to_region)
1410            for (
1411                (from_language, from_script, from_region),
1412                (_, _, to_region),
1413            ) in likely_subtags.items()
1414            if from_region is None and to_region in replacements
1415        ]
1416
1417        # The first replacement entry is the default region.
1418        default = replacements[0]
1419
1420        # Find all likely subtag entries whose region matches the default region.
1421        default_replacements = {
1422            (language, script)
1423            for (language, script, region) in region_likely_subtags
1424            if region == default
1425        }
1426
1427        # And finally find those entries which don't use the default region.
1428        # These are the entries we're actually interested in, because those need
1429        # to be handled specially when selecting the correct preferred region.
1430        non_default_replacements = [
1431            (language, script, region)
1432            for (language, script, region) in region_likely_subtags
1433            if (language, script) not in default_replacements
1434        ]
1435
1436        # If there are no non-default replacements, we can handle the region as
1437        # part of the simple region mapping.
1438        if non_default_replacements:
1439            complex_region_mappings_final[deprecated_region] = (
1440                default,
1441                non_default_replacements,
1442            )
1443        else:
1444            region_mappings[deprecated_region] = default
1445
1446    return {
1447        "legacyMappings": legacy_mappings,
1448        "languageMappings": language_mappings,
1449        "complexLanguageMappings": complex_language_mappings,
1450        "scriptMappings": script_mappings,
1451        "regionMappings": region_mappings,
1452        "complexRegionMappings": complex_region_mappings_final,
1453        "variantMappings": variant_mappings,
1454        "likelySubtags": likely_subtags,
1455    }
1456
1457
1458def readUnicodeExtensions(core_file):
1459    import xml.etree.ElementTree as ET
1460
1461    # Match all xml-files in the BCP 47 directory.
1462    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
1463
1464    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
1465    #
1466    # type = alphanum{3,8} (sep alphanum{3,8})* ;
1467    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
1468
1469    # Mapping from Unicode extension types to dict of deprecated to
1470    # preferred values.
1471    mapping = {
1472        # Unicode BCP 47 U Extension
1473        "u": {},
1474        # Unicode BCP 47 T Extension
1475        "t": {},
1476    }
1477
1478    def readBCP47File(file):
1479        tree = ET.parse(file)
1480        for keyword in tree.iterfind(".//keyword/key"):
1481            extension = keyword.get("extension", "u")
1482            assert (
1483                extension == "u" or extension == "t"
1484            ), "unknown extension type: {}".format(extension)
1485
1486            extension_name = keyword.get("name")
1487
1488            for type in keyword.iterfind("type"):
1489                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
1490                #
1491                # The key or type name used by Unicode locale extension with 'u' extension
1492                # syntax or the 't' extensions syntax. When alias below is absent, this name
1493                # can be also used with the old style "@key=type" syntax.
1494                name = type.get("name")
1495
1496                # Ignore the special name:
1497                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
1498                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
1499                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
1500                # - <https://unicode.org/reports/tr35/#SCRIPT_CODE>
1501                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
1502                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
1503                if name in (
1504                    "CODEPOINTS",
1505                    "REORDER_CODE",
1506                    "RG_KEY_VALUE",
1507                    "SCRIPT_CODE",
1508                    "SUBDIVISION_CODE",
1509                    "PRIVATE_USE",
1510                ):
1511                    continue
1512
1513                # All other names should match the 'type' production.
1514                assert (
1515                    typeRE.match(name) is not None
1516                ), "{} matches the 'type' production".format(name)
1517
1518                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
1519                #
1520                # The preferred value of the deprecated key, type or attribute element.
1521                # When a key, type or attribute element is deprecated, this attribute is
1522                # used for specifying a new canonical form if available.
1523                preferred = type.get("preferred")
1524
1525                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
1526                #
1527                # The BCP 47 form is the canonical form, and recommended. Other aliases are
1528                # included only for backwards compatibility.
1529                alias = type.get("alias")
1530
1531                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
1532                #
1533                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
1534                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
1535                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
1536                # value, while the canonical is in the name attribute value.
1537
1538                # 'preferred' contains the new preferred name, 'alias' the compatibility
1539                # name, but then there's this entry where 'preferred' and 'alias' are the
1540                # same. So which one to choose? Assume 'preferred' is the actual canonical
1541                # name.
1542                #
1543                # <type name="islamicc"
1544                #       description="Civil (algorithmic) Arabic calendar"
1545                #       deprecated="true"
1546                #       preferred="islamic-civil"
1547                #       alias="islamic-civil"/>
1548
1549                if preferred is not None:
1550                    assert typeRE.match(preferred), preferred
1551                    mapping[extension].setdefault(extension_name, {})[name] = preferred
1552
1553                if alias is not None:
1554                    for alias_name in alias.lower().split(" "):
1555                        # Ignore alias entries which don't match the 'type' production.
1556                        if typeRE.match(alias_name) is None:
1557                            continue
1558
1559                        # See comment above when 'alias' and 'preferred' are both present.
1560                        if (
1561                            preferred is not None
1562                            and name in mapping[extension][extension_name]
1563                        ):
1564                            continue
1565
1566                        # Skip over entries where 'name' and 'alias' are equal.
1567                        #
1568                        # <type name="pst8pdt"
1569                        #       description="POSIX style time zone for US Pacific Time"
1570                        #       alias="PST8PDT"
1571                        #       since="1.8"/>
1572                        if name == alias_name:
1573                            continue
1574
1575                        mapping[extension].setdefault(extension_name, {})[
1576                            alias_name
1577                        ] = name
1578
1579    def readSupplementalMetadata(file):
1580        # Find subdivision and region replacements.
1581        #
1582        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
1583        #
1584        # Replace aliases in special key values:
1585        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
1586        #     in its value in the same way, using subdivisionAlias data.
1587        tree = ET.parse(file)
1588        for alias in tree.iterfind(".//subdivisionAlias"):
1589            type = alias.get("type")
1590            assert (
1591                typeRE.match(type) is not None
1592            ), "{} matches the 'type' production".format(type)
1593
1594            # Take the first replacement when multiple ones are present.
1595            replacement = alias.get("replacement").split(" ")[0].lower()
1596
1597            # Skip over invalid replacements.
1598            #
1599            # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
1600            #
1601            # It's not entirely clear to me if CLDR actually wants to use
1602            # "axzzzz" as the replacement for this case.
1603            if typeRE.match(replacement) is None:
1604                continue
1605
1606            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
1607            mapping["u"].setdefault("rg", {})[type] = replacement
1608            mapping["u"].setdefault("sd", {})[type] = replacement
1609
1610    for name in core_file.namelist():
1611        if bcpFileRE.match(name):
1612            readBCP47File(core_file.open(name))
1613
1614    readSupplementalMetadata(
1615        core_file.open("common/supplemental/supplementalMetadata.xml")
1616    )
1617
1618    return {
1619        "unicodeMappings": mapping["u"],
1620        "transformMappings": mapping["t"],
1621    }
1622
1623
1624def writeCLDRLanguageTagData(println, data, url):
1625    """ Writes the language tag data to the Intl data file. """
1626
1627    println(generatedFileWarning)
1628    println("// Version: CLDR-{}".format(data["version"]))
1629    println("// URL: {}".format(url))
1630
1631    println(
1632        """
1633#include "mozilla/Assertions.h"
1634#include "mozilla/Span.h"
1635#include "mozilla/TextUtils.h"
1636
1637#include <algorithm>
1638#include <cstdint>
1639#include <cstring>
1640#include <iterator>
1641#include <string>
1642#include <type_traits>
1643
1644#include "builtin/intl/LanguageTag.h"
1645#include "util/Text.h"
1646#include "vm/JSContext.h"
1647
1648using namespace js::intl::LanguageTagLimits;
1649
1650template <size_t Length, size_t TagLength, size_t SubtagLength>
1651static inline bool HasReplacement(
1652    const char (&subtags)[Length][TagLength],
1653    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
1654  MOZ_ASSERT(subtag.length() == TagLength - 1,
1655             "subtag must have the same length as the list of subtags");
1656
1657  const char* ptr = subtag.span().data();
1658  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
1659                            [](const char* a, const char* b) {
1660    return memcmp(a, b, TagLength - 1) < 0;
1661  });
1662}
1663
1664template <size_t Length, size_t TagLength, size_t SubtagLength>
1665static inline const char* SearchReplacement(
1666    const char (&subtags)[Length][TagLength],
1667    const char* (&aliases)[Length],
1668    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
1669  MOZ_ASSERT(subtag.length() == TagLength - 1,
1670             "subtag must have the same length as the list of subtags");
1671
1672  const char* ptr = subtag.span().data();
1673  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
1674                            [](const char* a, const char* b) {
1675    return memcmp(a, b, TagLength - 1) < 0;
1676  });
1677  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
1678    return aliases[std::distance(std::begin(subtags), p)];
1679  }
1680  return nullptr;
1681}
1682
1683#ifdef DEBUG
1684static bool IsAsciiLowercaseAlphanumeric(char c) {
1685  return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
1686}
1687
1688static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
1689  return IsAsciiLowercaseAlphanumeric(c) || c == '-';
1690}
1691
1692static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
1693  // Tell the analysis the |std::all_of| function can't GC.
1694  JS::AutoSuppressGCAnalysis nogc;
1695
1696  return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>);
1697}
1698
1699static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
1700  // Tell the analysis the |std::all_of| function can't GC.
1701  JS::AutoSuppressGCAnalysis nogc;
1702
1703  return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
1704         std::all_of(span.begin() + 1, span.end(), mozilla::IsAsciiLowercaseAlpha<char>);
1705}
1706
1707static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
1708  // Tell the analysis the |std::all_of| function can't GC.
1709  JS::AutoSuppressGCAnalysis nogc;
1710
1711  return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) ||
1712         std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
1713}
1714
1715static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
1716  // Tell the analysis the |std::all_of| function can't GC.
1717  JS::AutoSuppressGCAnalysis nogc;
1718
1719  return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
1720}
1721
1722static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
1723  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
1724}
1725
1726static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
1727  return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
1728}
1729
1730static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
1731  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
1732}
1733
1734static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
1735  return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
1736}
1737#endif
1738""".rstrip()
1739    )
1740
1741    source = "CLDR Supplemental Data, version {}".format(data["version"])
1742    legacy_mappings = data["legacyMappings"]
1743    language_mappings = data["languageMappings"]
1744    complex_language_mappings = data["complexLanguageMappings"]
1745    script_mappings = data["scriptMappings"]
1746    region_mappings = data["regionMappings"]
1747    complex_region_mappings = data["complexRegionMappings"]
1748    variant_mappings = data["variantMappings"]
1749    unicode_mappings = data["unicodeMappings"]
1750    transform_mappings = data["transformMappings"]
1751
1752    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
1753    language_maxlength = 8
1754
1755    # unicode_script_subtag = alpha{4} ;
1756    script_maxlength = 4
1757
1758    # unicode_region_subtag = (alpha{2} | digit{3}) ;
1759    region_maxlength = 3
1760
1761    writeMappingsBinarySearch(
1762        println,
1763        "languageMapping",
1764        "LanguageSubtag&",
1765        "language",
1766        "IsStructurallyValidLanguageTag",
1767        "IsCanonicallyCasedLanguageTag",
1768        language_mappings,
1769        language_maxlength,
1770        "Mappings from language subtags to preferred values.",
1771        source,
1772        url,
1773    )
1774    writeMappingsBinarySearch(
1775        println,
1776        "complexLanguageMapping",
1777        "const LanguageSubtag&",
1778        "language",
1779        "IsStructurallyValidLanguageTag",
1780        "IsCanonicallyCasedLanguageTag",
1781        complex_language_mappings.keys(),
1782        language_maxlength,
1783        "Language subtags with complex mappings.",
1784        source,
1785        url,
1786    )
1787    writeMappingsBinarySearch(
1788        println,
1789        "scriptMapping",
1790        "ScriptSubtag&",
1791        "script",
1792        "IsStructurallyValidScriptTag",
1793        "IsCanonicallyCasedScriptTag",
1794        script_mappings,
1795        script_maxlength,
1796        "Mappings from script subtags to preferred values.",
1797        source,
1798        url,
1799    )
1800    writeMappingsBinarySearch(
1801        println,
1802        "regionMapping",
1803        "RegionSubtag&",
1804        "region",
1805        "IsStructurallyValidRegionTag",
1806        "IsCanonicallyCasedRegionTag",
1807        region_mappings,
1808        region_maxlength,
1809        "Mappings from region subtags to preferred values.",
1810        source,
1811        url,
1812    )
1813    writeMappingsBinarySearch(
1814        println,
1815        "complexRegionMapping",
1816        "const RegionSubtag&",
1817        "region",
1818        "IsStructurallyValidRegionTag",
1819        "IsCanonicallyCasedRegionTag",
1820        complex_region_mappings.keys(),
1821        region_maxlength,
1822        "Region subtags with complex mappings.",
1823        source,
1824        url,
1825    )
1826
1827    writeComplexLanguageTagMappings(
1828        println,
1829        complex_language_mappings,
1830        "Language subtags with complex mappings.",
1831        source,
1832        url,
1833    )
1834    writeComplexRegionTagMappings(
1835        println,
1836        complex_region_mappings,
1837        "Region subtags with complex mappings.",
1838        source,
1839        url,
1840    )
1841
1842    writeVariantTagMappings(
1843        println,
1844        variant_mappings,
1845        "Mappings from variant subtags to preferred values.",
1846        source,
1847        url,
1848    )
1849
1850    writeLegacyMappingsFunction(
1851        println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url
1852    )
1853
1854    writeSignLanguageMappingsFunction(
1855        println, legacy_mappings, "Mappings from legacy sign languages.", source, url
1856    )
1857
1858    writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
1859    writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
1860
1861
1862def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
1863    """ Writes the likely-subtags test file. """
1864
1865    println(generatedFileWarning)
1866
1867    source = "CLDR Supplemental Data, version {}".format(data["version"])
1868    language_mappings = data["languageMappings"]
1869    complex_language_mappings = data["complexLanguageMappings"]
1870    script_mappings = data["scriptMappings"]
1871    region_mappings = data["regionMappings"]
1872    complex_region_mappings = data["complexRegionMappings"]
1873    likely_subtags = data["likelySubtags"]
1874
1875    def bcp47(tag):
1876        (language, script, region) = tag
1877        return "{}{}{}".format(
1878            language, "-" + script if script else "", "-" + region if region else ""
1879        )
1880
1881    def canonical(tag):
1882        (language, script, region) = tag
1883
1884        # Map deprecated language subtags.
1885        if language in language_mappings:
1886            language = language_mappings[language]
1887        elif language in complex_language_mappings:
1888            (language2, script2, region2) = complex_language_mappings[language]
1889            (language, script, region) = (
1890                language2,
1891                script if script else script2,
1892                region if region else region2,
1893            )
1894
1895        # Map deprecated script subtags.
1896        if script in script_mappings:
1897            script = script_mappings[script]
1898
1899        # Map deprecated region subtags.
1900        if region in region_mappings:
1901            region = region_mappings[region]
1902        else:
1903            # Assume no complex region mappings are needed for now.
1904            assert (
1905                region not in complex_region_mappings
1906            ), "unexpected region with complex mappings: {}".format(region)
1907
1908        return (language, script, region)
1909
1910    # https://unicode.org/reports/tr35/#Likely_Subtags
1911
1912    def addLikelySubtags(tag):
1913        # Step 1: Canonicalize.
1914        (language, script, region) = canonical(tag)
1915        if script == "Zzzz":
1916            script = None
1917        if region == "ZZ":
1918            region = None
1919
1920        # Step 2: Lookup.
1921        searches = (
1922            (language, script, region),
1923            (language, None, region),
1924            (language, script, None),
1925            (language, None, None),
1926            ("und", script, None),
1927        )
1928        search = next(search for search in searches if search in likely_subtags)
1929
1930        (language_s, script_s, region_s) = search
1931        (language_m, script_m, region_m) = likely_subtags[search]
1932
1933        # Step 3: Return.
1934        return (
1935            language if language != language_s else language_m,
1936            script if script != script_s else script_m,
1937            region if region != region_s else region_m,
1938        )
1939
1940    # https://unicode.org/reports/tr35/#Likely_Subtags
1941    def removeLikelySubtags(tag):
1942        # Step 1: Add likely subtags.
1943        max = addLikelySubtags(tag)
1944
1945        # Step 2: Remove variants (doesn't apply here).
1946
1947        # Step 3: Find a match.
1948        (language, script, region) = max
1949        for trial in (
1950            (language, None, None),
1951            (language, None, region),
1952            (language, script, None),
1953        ):
1954            if addLikelySubtags(trial) == max:
1955                return trial
1956
1957        # Step 4: Return maximized if no match found.
1958        return max
1959
1960    def likely_canonical(from_tag, to_tag):
1961        # Canonicalize the input tag.
1962        from_tag = canonical(from_tag)
1963
1964        # Update the expected result if necessary.
1965        if from_tag in likely_subtags:
1966            to_tag = likely_subtags[from_tag]
1967
1968        # Canonicalize the expected output.
1969        to_canonical = canonical(to_tag)
1970
1971        # Sanity check: This should match the result of |addLikelySubtags|.
1972        assert to_canonical == addLikelySubtags(from_tag)
1973
1974        return to_canonical
1975
1976    # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
1977    likely_subtags_canonical = {
1978        k: likely_canonical(k, v) for (k, v) in likely_subtags.items()
1979    }
1980
1981    # Add test data for |Intl.Locale.prototype.maximize()|.
1982    writeMappingsVar(
1983        println,
1984        {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
1985        "maxLikelySubtags",
1986        "Extracted from likelySubtags.xml.",
1987        source,
1988        url,
1989    )
1990
1991    # Use the maximalized tags as the input for the remove likely-subtags test.
1992    minimized = {
1993        tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()
1994    }
1995
1996    # Add test data for |Intl.Locale.prototype.minimize()|.
1997    writeMappingsVar(
1998        println,
1999        {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
2000        "minLikelySubtags",
2001        "Extracted from likelySubtags.xml.",
2002        source,
2003        url,
2004    )
2005
2006    println(
2007        """
2008for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
2009    assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
2010}"""
2011    )
2012
2013    println(
2014        """
2015for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
2016    assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
2017}"""
2018    )
2019
2020    println(
2021        """
2022if (typeof reportCompare === "function")
2023    reportCompare(0, 0);"""
2024    )
2025
2026
2027def readCLDRVersionFromICU():
2028    icuDir = os.path.join(topsrcdir, "intl/icu/source")
2029    if not os.path.isdir(icuDir):
2030        raise RuntimeError("not a directory: {}".format(icuDir))
2031
2032    reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}')
2033
2034    for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")):
2035        m = reVersion.match(line)
2036        if m:
2037            version = m.group(1)
2038            break
2039
2040    if version is None:
2041        raise RuntimeError("can't resolve CLDR version")
2042
2043    return version
2044
2045
2046def updateCLDRLangTags(args):
2047    """ Update the LanguageTagGenerated.cpp file. """
2048    version = args.version
2049    url = args.url
2050    out = args.out
2051    filename = args.file
2052
2053    # Determine current CLDR version from ICU.
2054    if version is None:
2055        version = readCLDRVersionFromICU()
2056
2057    url = url.replace("<VERSION>", version)
2058
2059    print("Arguments:")
2060    print("\tCLDR version: %s" % version)
2061    print("\tDownload url: %s" % url)
2062    if filename is not None:
2063        print("\tLocal CLDR core.zip file: %s" % filename)
2064    print("\tOutput file: %s" % out)
2065    print("")
2066
2067    data = {
2068        "version": version,
2069    }
2070
2071    def readFiles(cldr_file):
2072        with ZipFile(cldr_file) as zip_file:
2073            data.update(readSupplementalData(zip_file))
2074            data.update(readUnicodeExtensions(zip_file))
2075
2076    print("Processing CLDR data...")
2077    if filename is not None:
2078        print("Always make sure you have the newest CLDR core.zip!")
2079        with open(filename, "rb") as cldr_file:
2080            readFiles(cldr_file)
2081    else:
2082        print("Downloading CLDR core.zip...")
2083        with closing(urlopen(url)) as cldr_file:
2084            cldr_data = io.BytesIO(cldr_file.read())
2085            readFiles(cldr_data)
2086
2087    print("Writing Intl data...")
2088    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
2089        println = partial(print, file=f)
2090
2091        writeCLDRLanguageTagData(println, data, url)
2092
2093    print("Writing Intl test data...")
2094    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
2095    test_file = os.path.join(
2096        js_src_builtin_intl_dir,
2097        "../../tests/non262/Intl/Locale/likely-subtags-generated.js",
2098    )
2099    with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
2100        println = partial(print, file=f)
2101
2102        println("// |reftest| skip-if(!this.hasOwnProperty('Intl'))")
2103        writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
2104
2105
2106def flines(filepath, encoding="utf-8"):
2107    """ Open filepath and iterate over its content. """
2108    with io.open(filepath, mode="r", encoding=encoding) as f:
2109        for line in f:
2110            yield line
2111
2112
2113@total_ordering
2114class Zone(object):
2115    """ Time zone with optional file name. """
2116
2117    def __init__(self, name, filename=""):
2118        self.name = name
2119        self.filename = filename
2120
2121    def __eq__(self, other):
2122        return hasattr(other, "name") and self.name == other.name
2123
2124    def __lt__(self, other):
2125        return self.name < other.name
2126
2127    def __hash__(self):
2128        return hash(self.name)
2129
2130    def __str__(self):
2131        return self.name
2132
2133    def __repr__(self):
2134        return self.name
2135
2136
2137class TzDataDir(object):
2138    """ tzdata source from a directory. """
2139
2140    def __init__(self, obj):
2141        self.name = partial(os.path.basename, obj)
2142        self.resolve = partial(os.path.join, obj)
2143        self.basename = os.path.basename
2144        self.isfile = os.path.isfile
2145        self.listdir = partial(os.listdir, obj)
2146        self.readlines = flines
2147
2148
2149class TzDataFile(object):
2150    """ tzdata source from a file (tar or gzipped). """
2151
2152    def __init__(self, obj):
2153        self.name = lambda: os.path.splitext(
2154            os.path.splitext(os.path.basename(obj))[0]
2155        )[0]
2156        self.resolve = obj.getmember
2157        self.basename = attrgetter("name")
2158        self.isfile = tarfile.TarInfo.isfile
2159        self.listdir = obj.getnames
2160        self.readlines = partial(self._tarlines, obj)
2161
2162    def _tarlines(self, tar, m):
2163        with closing(tar.extractfile(m)) as f:
2164            for line in f:
2165                yield line.decode("utf-8")
2166
2167
2168def validateTimeZones(zones, links):
2169    """ Validate the zone and link entries. """
2170    linkZones = set(links.keys())
2171    intersect = linkZones.intersection(zones)
2172    if intersect:
2173        raise RuntimeError("Links also present in zones: %s" % intersect)
2174
2175    zoneNames = {z.name for z in zones}
2176    linkTargets = set(links.values())
2177    if not linkTargets.issubset(zoneNames):
2178        raise RuntimeError(
2179            "Link targets not found: %s" % linkTargets.difference(zoneNames)
2180        )
2181
2182
2183def partition(iterable, *predicates):
2184    def innerPartition(pred, it):
2185        it1, it2 = tee(it)
2186        return (filter(pred, it1), filterfalse(pred, it2))
2187
2188    if len(predicates) == 0:
2189        return iterable
2190    (left, right) = innerPartition(predicates[0], iterable)
2191    if len(predicates) == 1:
2192        return (left, right)
2193    return tuple([left] + list(partition(right, *predicates[1:])))
2194
2195
2196def listIANAFiles(tzdataDir):
2197    def isTzFile(d, m, f):
2198        return m(f) and d.isfile(d.resolve(f))
2199
2200    return filter(
2201        partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match),
2202        tzdataDir.listdir(),
2203    )
2204
2205
2206def readIANAFiles(tzdataDir, files):
2207    """ Read all IANA time zone files from the given iterable. """
2208    nameSyntax = "[\w/+\-]+"
2209    pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax)
2210    pLink = re.compile(
2211        r"Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?" % (nameSyntax, nameSyntax)
2212    )
2213
2214    def createZone(line, fname):
2215        match = pZone.match(line)
2216        name = match.group("name")
2217        return Zone(name, fname)
2218
2219    def createLink(line, fname):
2220        match = pLink.match(line)
2221        (name, target) = match.group("name", "target")
2222        return (Zone(name, fname), target)
2223
2224    zones = set()
2225    links = dict()
2226    for filename in files:
2227        filepath = tzdataDir.resolve(filename)
2228        for line in tzdataDir.readlines(filepath):
2229            if line.startswith("Zone"):
2230                zones.add(createZone(line, filename))
2231            if line.startswith("Link"):
2232                (link, target) = createLink(line, filename)
2233                links[link] = target
2234
2235    return (zones, links)
2236
2237
2238def readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory):
2239    """ Read the IANA time zone information from `tzdataDir`. """
2240
2241    backzoneFiles = {"backzone"}
2242    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
2243
2244    # Read zone and link infos.
2245    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2246    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
2247
2248    # Remove the placeholder time zone "Factory".
2249    if ignoreFactory:
2250        zones.remove(Zone("Factory"))
2251
2252    # Merge with backzone data.
2253    if not ignoreBackzone:
2254        zones |= backzones
2255        links = {
2256            name: target for name, target in links.items() if name not in backzones
2257        }
2258        links.update(backlinks)
2259
2260    validateTimeZones(zones, links)
2261
2262    return (zones, links)
2263
2264
2265def readICUResourceFile(filename):
2266    """Read an ICU resource file.
2267
2268    Yields (<table-name>, <startOrEnd>, <value>) for each table.
2269    """
2270
2271    numberValue = r"-?\d+"
2272    stringValue = r'".+?"'
2273
2274    def asVector(val):
2275        return r"%s(?:\s*,\s*%s)*" % (val, val)
2276
2277    numberVector = asVector(numberValue)
2278    stringVector = asVector(stringValue)
2279
2280    reNumberVector = re.compile(numberVector)
2281    reStringVector = re.compile(stringVector)
2282    reNumberValue = re.compile(numberValue)
2283    reStringValue = re.compile(stringValue)
2284
2285    def parseValue(value):
2286        m = reNumberVector.match(value)
2287        if m:
2288            return [int(v) for v in reNumberValue.findall(value)]
2289        m = reStringVector.match(value)
2290        if m:
2291            return [v[1:-1] for v in reStringValue.findall(value)]
2292        raise RuntimeError("unknown value type: %s" % value)
2293
2294    def extractValue(values):
2295        if len(values) == 0:
2296            return None
2297        if len(values) == 1:
2298            return values[0]
2299        return values
2300
2301    def line(*args):
2302        maybeMultiComments = r"(?:/\*[^*]*\*/)*"
2303        maybeSingleComment = r"(?://.*)?"
2304        lineStart = "^%s" % maybeMultiComments
2305        lineEnd = "%s\s*%s$" % (maybeMultiComments, maybeSingleComment)
2306        return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd])))
2307
2308    tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)'
2309    tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector)
2310
2311    reStartTable = line(tableName, r"\{")
2312    reEndTable = line(r"\}")
2313    reSingleValue = line(r",?", tableValue, r",?")
2314    reCompactTable = line(tableName, r"\{", tableValue, r"\}")
2315    reEmptyLine = line()
2316
2317    tables = []
2318
2319    def currentTable():
2320        return "|".join(tables)
2321
2322    values = []
2323    for line in flines(filename, "utf-8-sig"):
2324        line = line.strip()
2325        if line == "":
2326            continue
2327
2328        m = reEmptyLine.match(line)
2329        if m:
2330            continue
2331
2332        m = reStartTable.match(line)
2333        if m:
2334            assert len(values) == 0
2335            tables.append(m.group("name"))
2336            continue
2337
2338        m = reEndTable.match(line)
2339        if m:
2340            yield (currentTable(), extractValue(values))
2341            tables.pop()
2342            values = []
2343            continue
2344
2345        m = reCompactTable.match(line)
2346        if m:
2347            assert len(values) == 0
2348            tables.append(m.group("name"))
2349            yield (currentTable(), extractValue(parseValue(m.group("value"))))
2350            tables.pop()
2351            continue
2352
2353        m = reSingleValue.match(line)
2354        if m and tables:
2355            values.extend(parseValue(m.group("value")))
2356            continue
2357
2358        raise RuntimeError("unknown entry: %s" % line)
2359
2360
2361def readICUTimeZonesFromTimezoneTypes(icuTzDir):
2362    """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt
2363    and returns the tuple (zones, links).
2364    """
2365    typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|"
2366    typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|"
2367
2368    def toTimeZone(name):
2369        return Zone(name.replace(":", "/"))
2370
2371    zones = set()
2372    links = dict()
2373
2374    for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")):
2375        if name.startswith(typeMapTimeZoneKey):
2376            zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :]))
2377        if name.startswith(typeAliasTimeZoneKey):
2378            links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value
2379
2380    validateTimeZones(zones, links)
2381
2382    return (zones, links)
2383
2384
2385def readICUTimeZonesFromZoneInfo(icuTzDir):
2386    """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt
2387    and returns the tuple (zones, links).
2388    """
2389    zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
2390    linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
2391    namesKey = "zoneinfo64:table(nofallback)|Names"
2392
2393    tzId = 0
2394    tzLinks = dict()
2395    tzNames = []
2396
2397    for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
2398        if name == zoneKey:
2399            tzId += 1
2400        elif name == linkKey:
2401            tzLinks[tzId] = int(value)
2402            tzId += 1
2403        elif name == namesKey:
2404            tzNames.extend(value)
2405
2406    links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
2407    zones = {Zone(v) for v in tzNames if Zone(v) not in links}
2408
2409    validateTimeZones(zones, links)
2410
2411    return (zones, links)
2412
2413
2414def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
2415    # zoneinfo64.txt contains the supported time zones by ICU. This data is
2416    # generated from tzdata files, it doesn't include "backzone" in stock ICU.
2417    (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir)
2418
2419    # timezoneTypes.txt contains the canonicalization information for ICU. This
2420    # data is generated from CLDR files. It includes data about time zones from
2421    # tzdata's "backzone" file.
2422    (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)
2423
2424    # Remove the placeholder time zone "Factory".
2425    # See also <https://github.com/eggert/tz/blob/master/factory>.
2426    if ignoreFactory:
2427        zoneinfoZones.remove(Zone("Factory"))
2428
2429    # Remove the ICU placeholder time zone "Etc/Unknown".
2430    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
2431    for zones in (zoneinfoZones, typesZones):
2432        zones.remove(Zone("Etc/Unknown"))
2433
2434    # Remove any outdated ICU links.
2435    for links in (zoneinfoLinks, typesLinks):
2436        for zone in otherICULegacyLinks().keys():
2437            if zone not in links:
2438                raise KeyError(f"Can't remove non-existent link from '{zone}'")
2439            del links[zone]
2440
2441    # Information in zoneinfo64 should be a superset of timezoneTypes.
2442    def inZoneInfo64(zone):
2443        return zone in zoneinfoZones or zone in zoneinfoLinks
2444
2445    notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
2446    if notFoundInZoneInfo64:
2447        raise RuntimeError(
2448            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
2449        )
2450
2451    notFoundInZoneInfo64 = [
2452        zone for zone in typesLinks.keys() if not inZoneInfo64(zone)
2453    ]
2454    if notFoundInZoneInfo64:
2455        raise RuntimeError(
2456            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
2457        )
2458
2459    # zoneinfo64.txt only defines the supported time zones by ICU, the canonicalization
2460    # rules are defined through timezoneTypes.txt. Merge both to get the actual zones
2461    # and links used by ICU.
2462    icuZones = set(
2463        chain(
2464            (zone for zone in zoneinfoZones if zone not in typesLinks),
2465            (zone for zone in typesZones),
2466        )
2467    )
2468    icuLinks = dict(
2469        chain(
2470            (
2471                (zone, target)
2472                for (zone, target) in zoneinfoLinks.items()
2473                if zone not in typesZones
2474            ),
2475            ((zone, target) for (zone, target) in typesLinks.items()),
2476        )
2477    )
2478
2479    return (icuZones, icuLinks)
2480
2481
2482def readICULegacyZones(icuDir):
2483    """Read the ICU legacy time zones from `icuTzDir`/tools/tzcode/icuzones
2484    and returns the tuple (zones, links).
2485    """
2486    tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))
2487
2488    # Per spec we must recognize only IANA time zones and links, but ICU
2489    # recognizes various legacy, non-IANA time zones and links. Compute these
2490    # non-IANA time zones and links.
2491
2492    # Most legacy, non-IANA time zones and links are in the icuzones file.
2493    (zones, links) = readIANAFiles(tzdir, ["icuzones"])
2494
2495    # Remove the ICU placeholder time zone "Etc/Unknown".
2496    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
2497    zones.remove(Zone("Etc/Unknown"))
2498
2499    # A handful of non-IANA zones/links are not in icuzones and must be added
2500    # manually so that we won't invoke ICU with them.
2501    for (zone, target) in otherICULegacyLinks().items():
2502        if zone in links:
2503            if links[zone] != target:
2504                raise KeyError(
2505                    f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'"
2506                )
2507            else:
2508                print(
2509                    f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()"
2510                )
2511        links[zone] = target
2512
2513    return (zones, links)
2514
2515
2516def otherICULegacyLinks():
2517    """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time
2518    zones with the exception of time zones which are removed by IANA after an
2519    ICU release.
2520
2521    For example ICU 67 uses tzdata2018i, but tzdata2020b removed the link from
2522    "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates
2523    don't include modified icuzones files, so we must manually record any IANA
2524    modifications here.
2525
2526    After an ICU update, we can remove any no longer needed entries from this
2527    function by checking if the relevant entries are now included in icuzones.
2528    """
2529
2530    return {
2531        # Current ICU is up-to-date with IANA, so this dict is empty.
2532    }
2533
2534
2535def icuTzDataVersion(icuTzDir):
2536    """ Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt. """
2537
2538    def searchInFile(pattern, f):
2539        p = re.compile(pattern)
2540        for line in flines(f, "utf-8-sig"):
2541            m = p.search(line)
2542            if m:
2543                return m.group(1)
2544        return None
2545
2546    zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt")
2547    if not os.path.isfile(zoneinfo):
2548        raise RuntimeError("file not found: %s" % zoneinfo)
2549    version = searchInFile("^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo)
2550    if version is None:
2551        raise RuntimeError(
2552            "%s does not contain a valid tzdata version string" % zoneinfo
2553        )
2554    return version
2555
2556
2557def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone):
2558    """ Find incorrect ICU zone entries. """
2559
2560    def isIANATimeZone(zone):
2561        return zone in ianaZones or zone in ianaLinks
2562
2563    def isICUTimeZone(zone):
2564        return zone in icuZones or zone in icuLinks
2565
2566    def isICULink(zone):
2567        return zone in icuLinks
2568
2569    # All IANA zones should be present in ICU.
2570    missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)]
2571    # Normally zones in backzone are also present as links in one of the other
2572    # time zone files. The only exception to this rule is the Asia/Hanoi time
2573    # zone, this zone is only present in the backzone file.
2574    expectedMissing = [] if ignoreBackzone else [Zone("Asia/Hanoi")]
2575    if missingTimeZones != expectedMissing:
2576        raise RuntimeError(
2577            "Not all zones are present in ICU, did you forget "
2578            "to run intl/update-tzdata.sh? %s" % missingTimeZones
2579        )
2580
2581    # Zones which are only present in ICU?
2582    additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)]
2583    if additionalTimeZones:
2584        raise RuntimeError(
2585            "Additional zones present in ICU, did you forget "
2586            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
2587        )
2588
2589    # Zones which are marked as links in ICU.
2590    result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))
2591
2592    # Remove unnecessary UTC mappings.
2593    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
2594    result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)
2595
2596    return sorted(result, key=itemgetter(0))
2597
2598
2599def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
2600    """ Find incorrect ICU link entries. """
2601
2602    def isIANATimeZone(zone):
2603        return zone in ianaZones or zone in ianaLinks
2604
2605    def isICUTimeZone(zone):
2606        return zone in icuZones or zone in icuLinks
2607
2608    def isICULink(zone):
2609        return zone in icuLinks
2610
2611    def isICUZone(zone):
2612        return zone in icuZones
2613
2614    # All links should be present in ICU.
2615    missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
2616    if missingTimeZones:
2617        raise RuntimeError(
2618            "Not all zones are present in ICU, did you forget "
2619            "to run intl/update-tzdata.sh? %s" % missingTimeZones
2620        )
2621
2622    # Links which are only present in ICU?
2623    additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
2624    if additionalTimeZones:
2625        raise RuntimeError(
2626            "Additional links present in ICU, did you forget "
2627            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
2628        )
2629
2630    result = chain(
2631        # IANA links which have a different target in ICU.
2632        (
2633            (zone, target, icuLinks[zone])
2634            for (zone, target) in ianaLinks.items()
2635            if isICULink(zone) and target != icuLinks[zone]
2636        ),
2637        # IANA links which are zones in ICU.
2638        (
2639            (zone, target, zone.name)
2640            for (zone, target) in ianaLinks.items()
2641            if isICUZone(zone)
2642        ),
2643    )
2644
2645    # Remove unnecessary UTC mappings.
2646    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
2647    result = (
2648        (zone, target, icuTarget)
2649        for (zone, target, icuTarget) in result
2650        if target not in utcnames or icuTarget not in utcnames
2651    )
2652
2653    return sorted(result, key=itemgetter(0))
2654
2655
2656generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT."
2657tzdataVersionComment = "// tzdata version = {0}"
2658
2659
2660def processTimeZones(
2661    tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignoreFactory, out
2662):
2663    """ Read the time zone info and create a new time zone cpp file. """
2664    print("Processing tzdata mapping...")
2665    (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
2666    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
2667    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)
2668
2669    # Remove all legacy ICU time zones.
2670    icuZones = {zone for zone in icuZones if zone not in legacyZones}
2671    icuLinks = {
2672        zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
2673    }
2674
2675    incorrectZones = findIncorrectICUZones(
2676        ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone
2677    )
2678    if not incorrectZones:
2679        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
2680        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")
2681
2682    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
2683    if not incorrectLinks:
2684        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
2685        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")
2686
2687    print("Writing Intl tzdata file...")
2688    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
2689        println = partial(print, file=f)
2690
2691        println(generatedFileWarning)
2692        println(tzdataVersionComment.format(version))
2693        println("")
2694
2695        println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
2696        println("#define builtin_intl_TimeZoneDataGenerated_h")
2697        println("")
2698
2699        println("namespace js {")
2700        println("namespace timezone {")
2701        println("")
2702
2703        println("// Format:")
2704        println('// "ZoneName" // ICU-Name [time zone file]')
2705        println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
2706        for (zone, icuZone) in incorrectZones:
2707            println('    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
2708        println("};")
2709        println("")
2710
2711        println("// Format:")
2712        println('// "LinkName", "Target" // ICU-Target [time zone file]')
2713        println("struct LinkAndTarget")
2714        println("{")
2715        println("    const char* const link;")
2716        println("    const char* const target;")
2717        println("};")
2718        println("")
2719        println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
2720        for (zone, target, icuTarget) in incorrectLinks:
2721            println(
2722                '    { "%s", "%s" }, // %s [%s]'
2723                % (zone, target, icuTarget, zone.filename)
2724            )
2725        println("};")
2726        println("")
2727
2728        println(
2729            "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
2730        )
2731        println("// disallow the old and deprecated System V time zones.")
2732        println(
2733            "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
2734        )  # NOQA: E501
2735        println("const char* const legacyICUTimeZones[] = {")
2736        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
2737            println('    "%s",' % zone)
2738        println("};")
2739        println("")
2740
2741        println("} // namespace timezone")
2742        println("} // namespace js")
2743        println("")
2744        println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")
2745
2746
2747def updateBackzoneLinks(tzdataDir, links):
2748    def withZone(fn):
2749        return lambda zone_target: fn(zone_target[0])
2750
2751    (backzoneZones, backzoneLinks) = readIANAFiles(tzdataDir, ["backzone"])
2752    (stableZones, updatedLinks, updatedZones) = partition(
2753        links.items(),
2754        # Link not changed in backzone.
2755        withZone(lambda zone: zone not in backzoneLinks and zone not in backzoneZones),
2756        # Link has a new target.
2757        withZone(lambda zone: zone in backzoneLinks),
2758    )
2759    # Keep stable zones and links with updated target.
2760    return dict(
2761        chain(
2762            stableZones,
2763            map(withZone(lambda zone: (zone, backzoneLinks[zone])), updatedLinks),
2764        )
2765    )
2766
2767
2768def generateTzDataLinkTestContent(testDir, version, fileName, description, links):
2769    with io.open(
2770        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
2771    ) as f:
2772        println = partial(print, file=f)
2773
2774        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
2775        println("")
2776        println(generatedFileWarning)
2777        println(tzdataVersionComment.format(version))
2778        println(
2779            """
2780const tzMapper = [
2781    x => x,
2782    x => x.toUpperCase(),
2783    x => x.toLowerCase(),
2784];
2785"""
2786        )
2787
2788        println(description)
2789        println("const links = {")
2790        for (zone, target) in sorted(links, key=itemgetter(0)):
2791            println('    "%s": "%s",' % (zone, target))
2792        println("};")
2793
2794        println(
2795            """
2796for (let [linkName, target] of Object.entries(links)) {
2797    if (target === "Etc/UTC" || target === "Etc/GMT")
2798        target = "UTC";
2799
2800    for (let map of tzMapper) {
2801        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
2802        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
2803        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
2804    }
2805}
2806"""
2807        )
2808        println(
2809            """
2810if (typeof reportCompare === "function")
2811    reportCompare(0, 0, "ok");
2812"""
2813        )
2814
2815
2816def generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
2817    (zones, links) = readIANAFiles(tzdataDir, ["backward"])
2818    assert len(zones) == 0
2819
2820    if not ignoreBackzone:
2821        links = updateBackzoneLinks(tzdataDir, links)
2822
2823    generateTzDataLinkTestContent(
2824        testDir,
2825        version,
2826        "timeZone_backward_links.js",
2827        "// Link names derived from IANA Time Zone Database, backward file.",
2828        links.items(),
2829    )
2830
2831
2832def generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
2833    tzfiles = filterfalse(
2834        {"backward", "backzone"}.__contains__, listIANAFiles(tzdataDir)
2835    )
2836    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2837
2838    if not ignoreBackzone:
2839        links = updateBackzoneLinks(tzdataDir, links)
2840
2841    generateTzDataLinkTestContent(
2842        testDir,
2843        version,
2844        "timeZone_notbackward_links.js",
2845        "// Link names derived from IANA Time Zone Database, excluding backward file.",
2846        links.items(),
2847    )
2848
2849
2850def generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir):
2851    backzoneFiles = {"backzone"}
2852    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
2853
2854    # Read zone and link infos.
2855    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2856    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
2857
2858    if not ignoreBackzone:
2859        comment = """\
2860// This file was generated with historical, pre-1970 backzone information
2861// respected. Therefore, every zone key listed below is its own Zone, not
2862// a Link to a modern-day target as IANA ignoring backzones would say.
2863
2864"""
2865    else:
2866        comment = """\
2867// This file was generated while ignoring historical, pre-1970 backzone
2868// information. Therefore, every zone key listed below is part of a Link
2869// whose target is the corresponding value.
2870
2871"""
2872
2873    generateTzDataLinkTestContent(
2874        testDir,
2875        version,
2876        "timeZone_backzone.js",
2877        comment + "// Backzone zones derived from IANA Time Zone Database.",
2878        (
2879            (zone, zone if not ignoreBackzone else links[zone])
2880            for zone in backzones
2881            if zone in links
2882        ),
2883    )
2884
2885
2886def generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir):
2887    backzoneFiles = {"backzone"}
2888    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
2889
2890    # Read zone and link infos.
2891    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2892    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
2893
2894    if not ignoreBackzone:
2895        comment = """\
2896// This file was generated with historical, pre-1970 backzone information
2897// respected. Therefore, every zone key listed below points to a target
2898// in the backzone file and not to its modern-day target as IANA ignoring
2899// backzones would say.
2900
2901"""
2902    else:
2903        comment = """\
2904// This file was generated while ignoring historical, pre-1970 backzone
2905// information. Therefore, every zone key listed below is part of a Link
2906// whose target is the corresponding value ignoring any backzone entries.
2907
2908"""
2909
2910    generateTzDataLinkTestContent(
2911        testDir,
2912        version,
2913        "timeZone_backzone_links.js",
2914        comment + "// Backzone links derived from IANA Time Zone Database.",
2915        (
2916            (zone, target if not ignoreBackzone else links[zone])
2917            for (zone, target) in backlinks.items()
2918        ),
2919    )
2920
2921
2922def generateTzDataTestVersion(tzdataDir, version, testDir):
2923    fileName = "timeZone_version.js"
2924
2925    with io.open(
2926        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
2927    ) as f:
2928        println = partial(print, file=f)
2929
2930        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
2931        println("")
2932        println(generatedFileWarning)
2933        println(tzdataVersionComment.format(version))
2934        println("""const tzdata = "{0}";""".format(version))
2935
2936        println(
2937            """
2938if (typeof getICUOptions === "undefined") {
2939    var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
2940}
2941
2942var options = getICUOptions();
2943
2944assertEq(options.tzdata, tzdata);
2945
2946if (typeof reportCompare === "function")
2947    reportCompare(0, 0, "ok");
2948"""
2949        )
2950
2951
2952def generateTzDataTests(tzdataDir, version, ignoreBackzone, testDir):
2953    generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
2954    generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
2955    generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir)
2956    generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir)
2957    generateTzDataTestVersion(tzdataDir, version, testDir)
2958
2959
2960def updateTzdata(topsrcdir, args):
2961    """ Update the time zone cpp file. """
2962
2963    icuDir = os.path.join(topsrcdir, "intl/icu/source")
2964    if not os.path.isdir(icuDir):
2965        raise RuntimeError("not a directory: %s" % icuDir)
2966
2967    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
2968    if not os.path.isdir(icuTzDir):
2969        raise RuntimeError("not a directory: %s" % icuTzDir)
2970
2971    dateTimeFormatTestDir = os.path.join(
2972        topsrcdir, "js/src/tests/non262/Intl/DateTimeFormat"
2973    )
2974    if not os.path.isdir(dateTimeFormatTestDir):
2975        raise RuntimeError("not a directory: %s" % dateTimeFormatTestDir)
2976
2977    tzDir = args.tz
2978    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
2979        raise RuntimeError("not a directory or file: %s" % tzDir)
2980    ignoreBackzone = args.ignore_backzone
2981    # TODO: Accept or ignore the placeholder time zone "Factory"?
2982    ignoreFactory = False
2983    out = args.out
2984
2985    version = icuTzDataVersion(icuTzDir)
2986    url = (
2987        "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
2988    )
2989
2990    print("Arguments:")
2991    print("\ttzdata version: %s" % version)
2992    print("\ttzdata URL: %s" % url)
2993    print("\ttzdata directory|file: %s" % tzDir)
2994    print("\tICU directory: %s" % icuDir)
2995    print("\tICU timezone directory: %s" % icuTzDir)
2996    print("\tIgnore backzone file: %s" % ignoreBackzone)
2997    print("\tOutput file: %s" % out)
2998    print("")
2999
3000    def updateFrom(f):
3001        if os.path.isfile(f) and tarfile.is_tarfile(f):
3002            with tarfile.open(f, "r:*") as tar:
3003                processTimeZones(
3004                    TzDataFile(tar),
3005                    icuDir,
3006                    icuTzDir,
3007                    version,
3008                    ignoreBackzone,
3009                    ignoreFactory,
3010                    out,
3011                )
3012                generateTzDataTests(
3013                    TzDataFile(tar), version, ignoreBackzone, dateTimeFormatTestDir
3014                )
3015        elif os.path.isdir(f):
3016            processTimeZones(
3017                TzDataDir(f),
3018                icuDir,
3019                icuTzDir,
3020                version,
3021                ignoreBackzone,
3022                ignoreFactory,
3023                out,
3024            )
3025            generateTzDataTests(
3026                TzDataDir(f), version, ignoreBackzone, dateTimeFormatTestDir
3027            )
3028        else:
3029            raise RuntimeError("unknown format")
3030
3031    if tzDir is None:
3032        print("Downloading tzdata file...")
3033        with closing(urlopen(url)) as tzfile:
3034            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
3035            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
3036                print("File stored in %s" % tztmpfile.name)
3037                tztmpfile.write(tzfile.read())
3038                tztmpfile.flush()
3039                updateFrom(tztmpfile.name)
3040    else:
3041        updateFrom(tzDir)
3042
3043
3044def readCurrencyFile(tree):
3045    reCurrency = re.compile(r"^[A-Z]{3}$")
3046    reIntMinorUnits = re.compile(r"^\d+$")
3047
3048    for country in tree.iterfind(".//CcyNtry"):
3049        # Skip entry if no currency information is available.
3050        currency = country.findtext("Ccy")
3051        if currency is None:
3052            continue
3053        assert reCurrency.match(currency)
3054
3055        minorUnits = country.findtext("CcyMnrUnts")
3056        assert minorUnits is not None
3057
3058        # Skip all entries without minorUnits or which use the default minorUnits.
3059        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
3060            currencyName = country.findtext("CcyNm")
3061            countryName = country.findtext("CtryNm")
3062            yield (currency, int(minorUnits), currencyName, countryName)
3063
3064
3065def writeCurrencyFile(published, currencies, out):
3066    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
3067        println = partial(print, file=f)
3068
3069        println(generatedFileWarning)
3070        println("// Version: {}".format(published))
3071
3072        println(
3073            """
3074/**
3075 * Mapping from currency codes to the number of decimal digits used for them.
3076 * Default is 2 digits.
3077 *
3078 * Spec: ISO 4217 Currency and Funds Code List.
3079 * http://www.currency-iso.org/en/home/tables/table-a1.html
3080 */"""
3081        )
3082        println("var currencyDigits = {")
3083        for (currency, entries) in groupby(
3084            sorted(currencies, key=itemgetter(0)), itemgetter(0)
3085        ):
3086            for (_, minorUnits, currencyName, countryName) in entries:
3087                println("    // {} ({})".format(currencyName, countryName))
3088            println("    {}: {},".format(currency, minorUnits))
3089        println("};")
3090
3091
3092def updateCurrency(topsrcdir, args):
3093    """ Update the CurrencyDataGenerated.js file. """
3094    import xml.etree.ElementTree as ET
3095    from random import randint
3096
3097    url = args.url
3098    out = args.out
3099    filename = args.file
3100
3101    print("Arguments:")
3102    print("\tDownload url: %s" % url)
3103    print("\tLocal currency file: %s" % filename)
3104    print("\tOutput file: %s" % out)
3105    print("")
3106
3107    def updateFrom(currencyFile):
3108        print("Processing currency code list file...")
3109        tree = ET.parse(currencyFile)
3110        published = tree.getroot().attrib["Pblshd"]
3111        currencies = readCurrencyFile(tree)
3112
3113        print("Writing CurrencyData file...")
3114        writeCurrencyFile(published, currencies, out)
3115
3116    if filename is not None:
3117        print("Always make sure you have the newest currency code list file!")
3118        updateFrom(filename)
3119    else:
3120        print("Downloading currency & funds code list...")
3121        request = UrlRequest(url)
3122        request.add_header(
3123            "User-agent",
3124            "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
3125                randint(1, 999)
3126            ),
3127        )
3128        with closing(urlopen(request)) as currencyFile:
3129            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
3130            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
3131                print("File stored in %s" % currencyTmpFile.name)
3132                currencyTmpFile.write(currencyFile.read())
3133                currencyTmpFile.flush()
3134                updateFrom(currencyTmpFile.name)
3135
3136
3137def writeUnicodeExtensionsMappings(println, mapping, extension):
3138    println(
3139        """
3140template <size_t Length>
3141static inline bool Is{0}Key(
3142  mozilla::Span<const char> key, const char (&str)[Length]) {{
3143  static_assert(Length == {0}KeyLength + 1,
3144                "{0} extension key is two characters long");
3145  return memcmp(key.data(), str, Length - 1) == 0;
3146}}
3147
3148template <size_t Length>
3149static inline bool Is{0}Type(
3150  mozilla::Span<const char> type, const char (&str)[Length]) {{
3151  static_assert(Length > {0}KeyLength + 1,
3152                "{0} extension type contains more than two characters");
3153  return type.size() == (Length - 1) &&
3154         memcmp(type.data(), str, Length - 1) == 0;
3155}}
3156""".format(
3157            extension
3158        ).rstrip(
3159            "\n"
3160        )
3161    )
3162
3163    linear_search_max_length = 4
3164
3165    needs_binary_search = any(
3166        len(replacements.items()) > linear_search_max_length
3167        for replacements in mapping.values()
3168    )
3169
3170    if needs_binary_search:
3171        println(
3172            """
3173static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
3174  MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
3175             "unexpected null-character in string");
3176
3177  using UnsignedChar = unsigned char;
3178  for (size_t i = 0; i < b.size(); i++) {{
3179    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
3180    // we've reached the end of |a|, the below if-statement will always be true.
3181    // That ensures we don't read past the end of |a|.
3182    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
3183      return r;
3184    }}
3185  }}
3186
3187  // Return zero if both strings are equal or a negative number if |b| is a
3188  // prefix of |a|.
3189  return -int32_t(UnsignedChar(a[b.size()]));
3190}}
3191
3192template <size_t Length>
3193static inline const char* Search{0}Replacement(
3194  const char* (&types)[Length], const char* (&aliases)[Length],
3195  mozilla::Span<const char> type) {{
3196
3197  auto p = std::lower_bound(std::begin(types), std::end(types), type,
3198                            [](const auto& a, const auto& b) {{
3199    return Compare{0}Type(a, b) < 0;
3200  }});
3201  if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
3202    return aliases[std::distance(std::begin(types), p)];
3203  }}
3204  return nullptr;
3205}}
3206""".format(
3207                extension
3208            ).rstrip(
3209                "\n"
3210            )
3211        )
3212
3213    println(
3214        """
3215/**
3216 * Mapping from deprecated BCP 47 {0} extension types to their preferred
3217 * values.
3218 *
3219 * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
3220 * Spec: https://www.unicode.org/reports/tr35/#t_Extension
3221 */
3222const char* js::intl::LanguageTag::replace{0}ExtensionType(
3223    mozilla::Span<const char> key, mozilla::Span<const char> type) {{
3224  MOZ_ASSERT(key.size() == {0}KeyLength);
3225  MOZ_ASSERT(IsCanonicallyCased{0}Key(key));
3226
3227  MOZ_ASSERT(type.size() > {0}KeyLength);
3228  MOZ_ASSERT(IsCanonicallyCased{0}Type(type));
3229""".format(
3230            extension
3231        )
3232    )
3233
3234    def to_hash_key(replacements):
3235        return str(sorted(replacements.items()))
3236
3237    def write_array(subtags, name, length):
3238        max_entries = (80 - len("    ")) // (length + len('"", '))
3239
3240        println("    static const char* {}[{}] = {{".format(name, len(subtags)))
3241
3242        for entries in grouper(subtags, max_entries):
3243            entries = (
3244                '"{}"'.format(tag).rjust(length + 2)
3245                for tag in entries
3246                if tag is not None
3247            )
3248            println("      {},".format(", ".join(entries)))
3249
3250        println("    };")
3251
3252    # Merge duplicate keys.
3253    key_aliases = {}
3254    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
3255        hash_key = to_hash_key(replacements)
3256        if hash_key not in key_aliases:
3257            key_aliases[hash_key] = []
3258        else:
3259            key_aliases[hash_key].append(key)
3260
3261    first_key = True
3262    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
3263        hash_key = to_hash_key(replacements)
3264        if key in key_aliases[hash_key]:
3265            continue
3266
3267        cond = (
3268            'Is{}Key(key, "{}")'.format(extension, k)
3269            for k in [key] + key_aliases[hash_key]
3270        )
3271
3272        if_kind = "if" if first_key else "else if"
3273        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
3274        println(
3275            """
3276  {} ({}) {{""".format(
3277                if_kind, cond
3278            ).strip(
3279                "\n"
3280            )
3281        )
3282        first_key = False
3283
3284        replacements = sorted(replacements.items(), key=itemgetter(0))
3285
3286        if len(replacements) > linear_search_max_length:
3287            types = [t for (t, _) in replacements]
3288            preferred = [r for (_, r) in replacements]
3289            max_len = max(len(k) for k in types + preferred)
3290
3291            write_array(types, "types", max_len)
3292            write_array(preferred, "aliases", max_len)
3293            println(
3294                """
3295    return Search{}Replacement(types, aliases, type);
3296""".format(
3297                    extension
3298                ).strip(
3299                    "\n"
3300                )
3301            )
3302        else:
3303            for (type, replacement) in replacements:
3304                println(
3305                    """
3306    if (Is{}Type(type, "{}")) {{
3307      return "{}";
3308    }}""".format(
3309                        extension, type, replacement
3310                    ).strip(
3311                        "\n"
3312                    )
3313                )
3314
3315        println(
3316            """
3317  }""".lstrip(
3318                "\n"
3319            )
3320        )
3321
3322    println(
3323        """
3324  return nullptr;
3325}
3326""".strip(
3327            "\n"
3328        )
3329    )
3330
3331
3332def readICUUnitResourceFile(filepath):
3333    """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
3334    second entry the unit name.
3335
3336    Example:
3337
3338    root{
3339        units{
3340            compound{
3341            }
3342            coordinate{
3343            }
3344            length{
3345                meter{
3346                }
3347            }
3348        }
3349        unitsNarrow:alias{"/LOCALE/unitsShort"}
3350        unitsShort{
3351            duration{
3352                day{
3353                }
3354                day-person:alias{"/LOCALE/unitsShort/duration/day"}
3355            }
3356            length{
3357                meter{
3358                }
3359            }
3360        }
3361    }
3362
3363    Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}
3364    """
3365
3366    start_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
3367    end_table_re = re.compile(r"^\}$")
3368    table_entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")
3369
3370    # The current resource table.
3371    table = {}
3372
3373    # List of parent tables when parsing.
3374    parents = []
3375
3376    # Track multi-line comments state.
3377    in_multiline_comment = False
3378
3379    for line in flines(filepath, "utf-8-sig"):
3380        # Remove leading and trailing whitespace.
3381        line = line.strip()
3382
3383        # Skip over comments.
3384        if in_multiline_comment:
3385            if line.endswith("*/"):
3386                in_multiline_comment = False
3387            continue
3388
3389        if line.startswith("//"):
3390            continue
3391
3392        if line.startswith("/*"):
3393            in_multiline_comment = True
3394            continue
3395
3396        # Try to match the start of a table, e.g. `length{` or `meter{`.
3397        match = start_table_re.match(line)
3398        if match:
3399            parents.append(table)
3400            table_name = match.group(1)
3401            new_table = {}
3402            table[table_name] = new_table
3403            table = new_table
3404            continue
3405
3406        # Try to match the end of a table.
3407        match = end_table_re.match(line)
3408        if match:
3409            table = parents.pop()
3410            continue
3411
3412        # Try to match a table entry, e.g. `dnam{"meter"}`.
3413        match = table_entry_re.match(line)
3414        if match:
3415            entry_key = match.group(1)
3416            entry_value = match.group(2)
3417            table[entry_key] = entry_value
3418            continue
3419
3420        raise Exception("unexpected line: '{}' in {}".format(line, filepath))
3421
3422    assert len(parents) == 0, "Not all tables closed"
3423    assert len(table) == 1, "More than one root table"
3424
3425    # Remove the top-level language identifier table.
3426    (_, unit_table) = table.popitem()
3427
3428    # Add all units for the three display formats "units", "unitsNarrow", and "unitsShort".
3429    # But exclude the pseudo-units "compound" and "ccoordinate".
3430    return {
3431        (unit_type, unit_name if not unit_name.endswith(":alias") else unit_name[:-6])
3432        for unit_display in ("units", "unitsNarrow", "unitsShort")
3433        if unit_display in unit_table
3434        for (unit_type, unit_names) in unit_table[unit_display].items()
3435        if unit_type != "compound" and unit_type != "coordinate"
3436        for unit_name in unit_names.keys()
3437    }
3438
3439
3440def computeSupportedUnits(all_units, sanctioned_units):
3441    """Given the set of all possible ICU unit identifiers and the set of sanctioned unit
3442    identifiers, compute the set of effectively supported ICU unit identifiers.
3443    """
3444
3445    def find_match(unit):
3446        unit_match = [
3447            (unit_type, unit_name)
3448            for (unit_type, unit_name) in all_units
3449            if unit_name == unit
3450        ]
3451        if unit_match:
3452            assert len(unit_match) == 1
3453            return unit_match[0]
3454        return None
3455
3456    def compound_unit_identifiers():
3457        for numerator in sanctioned_units:
3458            for denominator in sanctioned_units:
3459                yield "{}-per-{}".format(numerator, denominator)
3460
3461    supported_simple_units = {find_match(unit) for unit in sanctioned_units}
3462    assert None not in supported_simple_units
3463
3464    supported_compound_units = {
3465        unit_match
3466        for unit_match in (find_match(unit) for unit in compound_unit_identifiers())
3467        if unit_match
3468    }
3469
3470    return supported_simple_units | supported_compound_units
3471
3472
3473def readICUDataFilterForUnits(data_filter_file):
3474    with io.open(data_filter_file, mode="r", encoding="utf-8") as f:
3475        data_filter = json.load(f)
3476
3477    # Find the rule set for the "unit_tree".
3478    unit_tree_rules = [
3479        entry["rules"]
3480        for entry in data_filter["resourceFilters"]
3481        if entry["categories"] == ["unit_tree"]
3482    ]
3483    assert len(unit_tree_rules) == 1
3484
3485    # Compute the list of included units from that rule set. The regular expression must match
3486    # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound".
3487    included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$")
3488    filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0])
3489
3490    return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit}
3491
3492
3493def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units):
3494    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3495
3496    def find_unit_type(unit):
3497        result = [
3498            unit_type for (unit_type, unit_name) in all_units if unit_name == unit
3499        ]
3500        assert result and len(result) == 1
3501        return result[0]
3502
3503    sanctioned_js_file = os.path.join(
3504        js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js"
3505    )
3506    with io.open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f:
3507        println = partial(print, file=f)
3508
3509        sanctioned_units_object = json.dumps(
3510            {unit: True for unit in sorted(sanctioned_units)},
3511            sort_keys=True,
3512            indent=4,
3513            separators=(",", ": "),
3514        )
3515
3516        println(generatedFileWarning)
3517
3518        println(
3519            """
3520/**
3521 * The list of currently supported simple unit identifiers.
3522 *
3523 * Intl.NumberFormat Unified API Proposal
3524 */"""
3525        )
3526
3527        println(
3528            "var sanctionedSimpleUnitIdentifiers = {};".format(sanctioned_units_object)
3529        )
3530
3531    sanctioned_cpp_file = os.path.join(
3532        js_src_builtin_intl_dir, "MeasureUnitGenerated.h"
3533    )
3534    with io.open(sanctioned_cpp_file, mode="w", encoding="utf-8", newline="") as f:
3535        println = partial(print, file=f)
3536
3537        println(generatedFileWarning)
3538
3539        println(
3540            """
3541struct MeasureUnit {
3542  const char* const type;
3543  const char* const name;
3544};
3545
3546/**
3547 * The list of currently supported simple unit identifiers.
3548 *
3549 * The list must be kept in alphabetical order of |name|.
3550 */
3551inline constexpr MeasureUnit simpleMeasureUnits[] = {
3552    // clang-format off"""
3553        )
3554
3555        for unit_name in sorted(sanctioned_units):
3556            println('  {{"{}", "{}"}},'.format(find_unit_type(unit_name), unit_name))
3557
3558        println(
3559            """
3560    // clang-format on
3561};""".lstrip(
3562                "\n"
3563            )
3564        )
3565
3566    shutil.copyfile(
3567        sanctioned_cpp_file,
3568        os.path.join(
3569            js_src_builtin_intl_dir,
3570            "../../../../intl/components/src/MeasureUnitGenerated.h",
3571        ),
3572    )
3573
3574    writeUnitTestFiles(all_units, sanctioned_units)
3575
3576
3577def writeUnitTestFiles(all_units, sanctioned_units):
3578    """ Generate test files for unit number formatters. """
3579
3580    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3581    test_dir = os.path.join(
3582        js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat"
3583    )
3584
3585    def write_test(file_name, test_content, indent=4):
3586        file_path = os.path.join(test_dir, file_name)
3587        with io.open(file_path, mode="w", encoding="utf-8", newline="") as f:
3588            println = partial(print, file=f)
3589
3590            println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
3591            println("")
3592            println(generatedFileWarning)
3593            println("")
3594
3595            sanctioned_units_array = json.dumps(
3596                [unit for unit in sorted(sanctioned_units)],
3597                indent=indent,
3598                separators=(",", ": "),
3599            )
3600
3601            println(
3602                "const sanctionedSimpleUnitIdentifiers = {};".format(
3603                    sanctioned_units_array
3604                )
3605            )
3606
3607            println(test_content)
3608
3609            println(
3610                """
3611if (typeof reportCompare === "function")
3612{}reportCompare(true, true);""".format(
3613                    " " * indent
3614                )
3615            )
3616
3617    write_test(
3618        "unit-compound-combinations.js",
3619        """
3620// Test all simple unit identifier combinations are allowed.
3621
3622for (const numerator of sanctionedSimpleUnitIdentifiers) {
3623    for (const denominator of sanctionedSimpleUnitIdentifiers) {
3624        const unit = `${numerator}-per-${denominator}`;
3625        const nf = new Intl.NumberFormat("en", {style: "unit", unit});
3626
3627        assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
3628    }
3629}""",
3630    )
3631
3632    all_units_array = json.dumps(
3633        ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ")
3634    )
3635
3636    write_test(
3637        "unit-well-formed.js",
3638        """
3639const allUnits = {};
3640""".format(
3641            all_units_array
3642        )
3643        + """
3644// Test only sanctioned unit identifiers are allowed.
3645
3646for (const typeAndUnit of allUnits) {
3647    const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/);
3648
3649    let allowed;
3650    if (unit.includes("-per-")) {
3651        const [numerator, denominator] = unit.split("-per-");
3652        allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) &&
3653                  sanctionedSimpleUnitIdentifiers.includes(denominator);
3654    } else {
3655        allowed = sanctionedSimpleUnitIdentifiers.includes(unit);
3656    }
3657
3658    if (allowed) {
3659        const nf = new Intl.NumberFormat("en", {style: "unit", unit});
3660        assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
3661    } else {
3662        assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}),
3663                               RangeError, `Missing error for "${typeAndUnit}"`);
3664    }
3665}""",
3666    )
3667
3668    write_test(
3669        "unit-formatToParts-has-unit-field.js",
3670        """
3671// Test only English and Chinese to keep the overall runtime reasonable.
3672//
3673// Chinese is included because it contains more than one "unit" element for
3674// certain unit combinations.
3675const locales = ["en", "zh"];
3676
3677// Plural rules for English only differentiate between "one" and "other". Plural
3678// rules for Chinese only use "other". That means we only need to test two values
3679// per unit.
3680const values = [0, 1];
3681
3682// Ensure unit formatters contain at least one "unit" element.
3683
3684for (const locale of locales) {
3685  for (const unit of sanctionedSimpleUnitIdentifiers) {
3686    const nf = new Intl.NumberFormat(locale, {style: "unit", unit});
3687
3688    for (const value of values) {
3689      assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
3690               `locale=${locale}, unit=${unit}`);
3691    }
3692  }
3693
3694  for (const numerator of sanctionedSimpleUnitIdentifiers) {
3695    for (const denominator of sanctionedSimpleUnitIdentifiers) {
3696      const unit = `${numerator}-per-${denominator}`;
3697      const nf = new Intl.NumberFormat(locale, {style: "unit", unit});
3698
3699      for (const value of values) {
3700        assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
3701                 `locale=${locale}, unit=${unit}`);
3702      }
3703    }
3704  }
3705}""",
3706        indent=2,
3707    )
3708
3709
3710def updateUnits(topsrcdir, args):
3711    icu_path = os.path.join(topsrcdir, "intl", "icu")
3712    icu_unit_path = os.path.join(icu_path, "source", "data", "unit")
3713
3714    with io.open(
3715        "SanctionedSimpleUnitIdentifiers.yaml", mode="r", encoding="utf-8"
3716    ) as f:
3717        sanctioned_units = yaml.safe_load(f)
3718
3719    # Read all possible ICU unit identifiers from the "unit/root.txt" resource.
3720    unit_root_file = os.path.join(icu_unit_path, "root.txt")
3721    all_units = readICUUnitResourceFile(unit_root_file)
3722
3723    # Compute the set of effectively supported ICU unit identifiers.
3724    supported_units = computeSupportedUnits(all_units, sanctioned_units)
3725
3726    # Read the list of units we're including into the ICU data file.
3727    data_filter_file = os.path.join(icu_path, "data_filter.json")
3728    filtered_units = readICUDataFilterForUnits(data_filter_file)
3729
3730    # Both sets must match to avoid resource loading errors at runtime.
3731    if supported_units != filtered_units:
3732
3733        def units_to_string(units):
3734            return ", ".join("/".join(u) for u in units)
3735
3736        missing = supported_units - filtered_units
3737        if missing:
3738            raise RuntimeError("Missing units: {}".format(units_to_string(missing)))
3739
3740        # Not exactly an error, but we currently don't have a use case where we need to support
3741        # more units than required by ECMA-402.
3742        extra = filtered_units - supported_units
3743        if extra:
3744            raise RuntimeError("Unnecessary units: {}".format(units_to_string(extra)))
3745
3746    writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units)
3747
3748
3749def readICUNumberingSystemsResourceFile(filepath):
3750    """Returns a dictionary of numbering systems where the key denotes the numbering system name
3751    and the value a dictionary with additional numbering system data.
3752
3753    Example:
3754
3755    numberingSystems:table(nofallback){
3756        numberingSystems{
3757            latn{
3758                algorithmic:int{0}
3759                desc{"0123456789"}
3760                radix:int{10}
3761            }
3762            roman{
3763                algorithmic:int{1}
3764                desc{"%roman-upper"}
3765                radix:int{10}
3766            }
3767        }
3768    }
3769
3770    Returns {"latn": {"digits": "0123456789", "algorithmic": False},
3771             "roman": {"algorithmic": True}}
3772    """
3773
3774    start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
3775    end_table_re = re.compile(r"^\}$")
3776    table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")
3777
3778    # The current resource table.
3779    table = {}
3780
3781    # List of parent tables when parsing.
3782    parents = []
3783
3784    # Track multi-line comments state.
3785    in_multiline_comment = False
3786
3787    for line in flines(filepath, "utf-8-sig"):
3788        # Remove leading and trailing whitespace.
3789        line = line.strip()
3790
3791        # Skip over comments.
3792        if in_multiline_comment:
3793            if line.endswith("*/"):
3794                in_multiline_comment = False
3795            continue
3796
3797        if line.startswith("//"):
3798            continue
3799
3800        if line.startswith("/*"):
3801            in_multiline_comment = True
3802            continue
3803
3804        # Try to match the start of a table, e.g. `latn{`.
3805        match = start_table_re.match(line)
3806        if match:
3807            parents.append(table)
3808            table_name = match.group(1)
3809            new_table = {}
3810            table[table_name] = new_table
3811            table = new_table
3812            continue
3813
3814        # Try to match the end of a table.
3815        match = end_table_re.match(line)
3816        if match:
3817            table = parents.pop()
3818            continue
3819
3820        # Try to match a table entry, e.g. `desc{"0123456789"}`.
3821        match = table_entry_re.match(line)
3822        if match:
3823            entry_key = match.group(1)
3824            entry_value = (
3825                match.group(2) if match.group(2) is not None else int(match.group(3))
3826            )
3827            table[entry_key] = entry_value
3828            continue
3829
3830        raise Exception("unexpected line: '{}' in {}".format(line, filepath))
3831
3832    assert len(parents) == 0, "Not all tables closed"
3833    assert len(table) == 1, "More than one root table"
3834
3835    # Remove the two top-level "numberingSystems" tables.
3836    (_, numbering_systems) = table.popitem()
3837    (_, numbering_systems) = numbering_systems.popitem()
3838
3839    # Assert all numbering systems use base 10.
3840    assert all(ns["radix"] == 10 for ns in numbering_systems.values())
3841
3842    # Return the numbering systems.
3843    return {
3844        key: {"digits": value["desc"], "algorithmic": False}
3845        if not bool(value["algorithmic"])
3846        else {"algorithmic": True}
3847        for (key, value) in numbering_systems.items()
3848    }
3849
3850
3851def writeNumberingSystemFiles(numbering_systems):
3852    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3853
3854    numbering_systems_js_file = os.path.join(
3855        js_src_builtin_intl_dir, "NumberingSystemsGenerated.h"
3856    )
3857    with io.open(
3858        numbering_systems_js_file, mode="w", encoding="utf-8", newline=""
3859    ) as f:
3860        println = partial(print, file=f)
3861
3862        println(generatedFileWarning)
3863
3864        println(
3865            """
3866/**
3867 * The list of numbering systems with simple digit mappings.
3868 */
3869
3870#ifndef builtin_intl_NumberingSystemsGenerated_h
3871#define builtin_intl_NumberingSystemsGenerated_h
3872"""
3873        )
3874
3875        simple_numbering_systems = sorted(
3876            name
3877            for (name, value) in numbering_systems.items()
3878            if not value["algorithmic"]
3879        )
3880
3881        println("// clang-format off")
3882        println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\")
3883        println(
3884            "{}".format(
3885                ", \\\n".join(
3886                    '  "{}"'.format(name) for name in simple_numbering_systems
3887                )
3888            )
3889        )
3890        println("// clang-format on")
3891        println("")
3892
3893        println("#endif  // builtin_intl_NumberingSystemsGenerated_h")
3894
3895    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3896    test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl")
3897
3898    intl_shell_js_file = os.path.join(test_dir, "shell.js")
3899
3900    with io.open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f:
3901        println = partial(print, file=f)
3902
3903        println(generatedFileWarning)
3904
3905        println(
3906            """
3907// source: CLDR file common/bcp47/number.xml; version CLDR {}.
3908// https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml
3909// https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml
3910""".format(
3911                readCLDRVersionFromICU()
3912            ).rstrip()
3913        )
3914
3915        numbering_systems_object = json.dumps(
3916            numbering_systems,
3917            indent=2,
3918            separators=(",", ": "),
3919            sort_keys=True,
3920            ensure_ascii=False,
3921        )
3922        println("const numberingSystems = {};".format(numbering_systems_object))
3923
3924
3925def updateNumberingSystems(topsrcdir, args):
3926    icu_path = os.path.join(topsrcdir, "intl", "icu")
3927    icu_misc_path = os.path.join(icu_path, "source", "data", "misc")
3928
3929    with io.open("NumberingSystems.yaml", mode="r", encoding="utf-8") as f:
3930        numbering_systems = yaml.safe_load(f)
3931
3932    # Read all possible ICU unit identifiers from the "misc/numberingSystems.txt" resource.
3933    misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt")
3934    all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file)
3935
3936    all_numbering_systems_simple_digits = {
3937        name
3938        for (name, value) in all_numbering_systems.items()
3939        if not value["algorithmic"]
3940    }
3941
3942    # Assert ICU includes support for all required numbering systems. If this assertion fails,
3943    # something is broken in ICU.
3944    assert all_numbering_systems_simple_digits.issuperset(
3945        numbering_systems
3946    ), "{}".format(numbering_systems.difference(all_numbering_systems_simple_digits))
3947
3948    # Assert the spec requires support for all numbering systems with simple digit mappings. If
3949    # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new
3950    # numbering systems.
3951    assert all_numbering_systems_simple_digits.issubset(numbering_systems), "{}".format(
3952        all_numbering_systems_simple_digits.difference(numbering_systems)
3953    )
3954
3955    writeNumberingSystemFiles(all_numbering_systems)
3956
3957
3958if __name__ == "__main__":
3959    import argparse
3960
3961    # This script must reside in js/src/builtin/intl to work correctly.
3962    (thisDir, thisFile) = os.path.split(os.path.abspath(sys.argv[0]))
3963    dirPaths = os.path.normpath(thisDir).split(os.sep)
3964    if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
3965        raise RuntimeError("%s must reside in js/src/builtin/intl" % sys.argv[0])
3966    topsrcdir = "/".join(dirPaths[:-4])
3967
3968    def EnsureHttps(v):
3969        if not v.startswith("https:"):
3970            raise argparse.ArgumentTypeError("URL protocol must be https: " % v)
3971        return v
3972
3973    parser = argparse.ArgumentParser(description="Update intl data.")
3974    subparsers = parser.add_subparsers(help="Select update mode")
3975
3976    parser_cldr_tags = subparsers.add_parser(
3977        "langtags", help="Update CLDR language tags data"
3978    )
3979    parser_cldr_tags.add_argument(
3980        "--version", metavar="VERSION", help="CLDR version number"
3981    )
3982    parser_cldr_tags.add_argument(
3983        "--url",
3984        metavar="URL",
3985        default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
3986        type=EnsureHttps,
3987        help="Download url CLDR data (default: %(default)s)",
3988    )
3989    parser_cldr_tags.add_argument(
3990        "--out",
3991        default="LanguageTagGenerated.cpp",
3992        help="Output file (default: %(default)s)",
3993    )
3994    parser_cldr_tags.add_argument(
3995        "file", nargs="?", help="Local cldr-core.zip file, if omitted uses <URL>"
3996    )
3997    parser_cldr_tags.set_defaults(func=updateCLDRLangTags)
3998
3999    parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
4000    parser_tz.add_argument(
4001        "--tz",
4002        help="Local tzdata directory or file, if omitted downloads tzdata "
4003        "distribution from https://www.iana.org/time-zones/",
4004    )
4005    # ICU doesn't include the backzone file by default, but we still like to
4006    # use the backzone time zone names to avoid user confusion. This does lead
4007    # to formatting "historic" dates (pre-1970 era) with the wrong time zone,
4008    # but that's probably acceptable for now.
4009    parser_tz.add_argument(
4010        "--ignore-backzone",
4011        action="store_true",
4012        help="Ignore tzdata's 'backzone' file. Can be enabled to generate more "
4013        "accurate time zone canonicalization reflecting the actual time "
4014        "zones as used by ICU.",
4015    )
4016    parser_tz.add_argument(
4017        "--out",
4018        default="TimeZoneDataGenerated.h",
4019        help="Output file (default: %(default)s)",
4020    )
4021    parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))
4022
4023    parser_currency = subparsers.add_parser(
4024        "currency", help="Update currency digits mapping"
4025    )
4026    parser_currency.add_argument(
4027        "--url",
4028        metavar="URL",
4029        default="https://www.currency-iso.org/dam/downloads/lists/list_one.xml",  # NOQA: E501
4030        type=EnsureHttps,
4031        help="Download url for the currency & funds code list (default: "
4032        "%(default)s)",
4033    )
4034    parser_currency.add_argument(
4035        "--out",
4036        default="CurrencyDataGenerated.js",
4037        help="Output file (default: %(default)s)",
4038    )
4039    parser_currency.add_argument(
4040        "file", nargs="?", help="Local currency code list file, if omitted uses <URL>"
4041    )
4042    parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir))
4043
4044    parser_units = subparsers.add_parser(
4045        "units", help="Update sanctioned unit identifiers mapping"
4046    )
4047    parser_units.set_defaults(func=partial(updateUnits, topsrcdir))
4048
4049    parser_numbering_systems = subparsers.add_parser(
4050        "numbering", help="Update numbering systems with simple " "digit mappings"
4051    )
4052    parser_numbering_systems.set_defaults(
4053        func=partial(updateNumberingSystems, topsrcdir)
4054    )
4055
4056    args = parser.parse_args()
4057    args.func(args)
4058