1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# This Source Code Form is subject to the terms of the Mozilla Public
5# License, v. 2.0. If a copy of the MPL was not distributed with this
6# file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8""" Usage:
9    make_intl_data.py langtags [cldr_core.zip]
10    make_intl_data.py tzdata
11    make_intl_data.py currency
12    make_intl_data.py units
13    make_intl_data.py numbering
14
15
16    Target "langtags":
17    This script extracts information about 1) mappings between deprecated and
18    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
19    BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
20    code in intl/components/LocaleGenerated.cpp. The code is used in
21    intl/components/Locale.cpp.
22
23
24    Target "tzdata":
25    This script computes which time zone informations are not up-to-date in ICU
26    and provides the necessary mappings to workaround this problem.
27    https://ssl.icu-project.org/trac/ticket/12044
28
29
30    Target "currency":
31    Generates the mapping from currency codes to decimal digits used for them.
32
33
34    Target "units":
35    Generate source and test files using the list of so-called "sanctioned unit
36    identifiers" and verifies that the ICU data filter includes these units.
37
38
39    Target "numbering":
40    Generate source and test files using the list of numbering systems with
41    simple digit mappings and verifies that it's in sync with ICU/CLDR.
42"""
43
44from __future__ import print_function
45import os
46import re
47import io
48import json
49import sys
50import tarfile
51import tempfile
52import yaml
53from contextlib import closing
54from functools import partial, total_ordering
55from itertools import chain, groupby, tee
56from operator import attrgetter, itemgetter
57from zipfile import ZipFile
58
59if sys.version_info.major == 2:
60    from itertools import (
61        ifilter as filter,
62        ifilterfalse as filterfalse,
63        imap as map,
64        izip_longest as zip_longest,
65    )
66    from urllib2 import urlopen, Request as UrlRequest
67    from urlparse import urlsplit
68else:
69    from itertools import filterfalse, zip_longest
70    from urllib.request import urlopen, Request as UrlRequest
71    from urllib.parse import urlsplit
72
73
74# From https://docs.python.org/3/library/itertools.html
75def grouper(iterable, n, fillvalue=None):
76    "Collect data into fixed-length chunks or blocks"
77    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
78    args = [iter(iterable)] * n
79    return zip_longest(*args, fillvalue=fillvalue)
80
81
82def writeMappingHeader(println, description, source, url):
83    if type(description) is not list:
84        description = [description]
85    for desc in description:
86        println("// {0}".format(desc))
87    println("// Derived from {0}.".format(source))
88    println("// {0}".format(url))
89
90
91def writeMappingsVar(println, mapping, name, description, source, url):
92    """Writes a variable definition with a mapping table.
93
94    Writes the contents of dictionary |mapping| through the |println|
95    function with the given variable name and a comment with description,
96    fileDate, and URL.
97    """
98    println("")
99    writeMappingHeader(println, description, source, url)
100    println("var {0} = {{".format(name))
101    for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
102        println('    "{0}": "{1}",'.format(key, value))
103    println("};")
104
105
106def writeMappingsBinarySearch(
107    println,
108    fn_name,
109    type_name,
110    name,
111    validate_fn,
112    validate_case_fn,
113    mappings,
114    tag_maxlength,
115    description,
116    source,
117    url,
118):
119    """Emit code to perform a binary search on language tag subtags.
120
121    Uses the contents of |mapping|, which can either be a dictionary or set,
122    to emit a mapping function to find subtag replacements.
123    """
124    println("")
125    writeMappingHeader(println, description, source, url)
126    println(
127        """
128bool mozilla::intl::Locale::{0}({1} {2}) {{
129  MOZ_ASSERT({3}({2}.Span()));
130  MOZ_ASSERT({4}({2}.Span()));
131""".format(
132            fn_name, type_name, name, validate_fn, validate_case_fn
133        ).strip()
134    )
135    writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength)
136
137    println(
138        """
139}""".lstrip(
140            "\n"
141        )
142    )
143
144
145def writeMappingsBinarySearchBody(
146    println, source_name, target_name, mappings, tag_maxlength
147):
148    def write_array(subtags, name, length, fixed):
149        if fixed:
150            println(
151                "    static const char {}[{}][{}] = {{".format(
152                    name, len(subtags), length + 1
153                )
154            )
155        else:
156            println("    static const char* {}[{}] = {{".format(name, len(subtags)))
157
158        # Group in pairs of ten to not exceed the 80 line column limit.
159        for entries in grouper(subtags, 10):
160            entries = (
161                '"{}"'.format(tag).rjust(length + 2)
162                for tag in entries
163                if tag is not None
164            )
165            println("      {},".format(", ".join(entries)))
166
167        println("    };")
168
169    trailing_return = True
170
171    # Sort the subtags by length. That enables using an optimized comparator
172    # for the binary search, which only performs a single |memcmp| for multiple
173    # of two subtag lengths.
174    mappings_keys = mappings.keys() if type(mappings) == dict else mappings
175    for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
176        # Omit the length check if the current length is the maximum length.
177        if length != tag_maxlength:
178            println(
179                """
180  if ({}.Length() == {}) {{
181""".format(
182                    source_name, length
183                ).rstrip(
184                    "\n"
185                )
186            )
187        else:
188            trailing_return = False
189            println(
190                """
191  {
192""".rstrip(
193                    "\n"
194                )
195            )
196
197        # The subtags need to be sorted for binary search to work.
198        subtags = sorted(subtags)
199
200        def equals(subtag):
201            return """{}.EqualTo("{}")""".format(source_name, subtag)
202
203        # Don't emit a binary search for short lists.
204        if len(subtags) == 1:
205            if type(mappings) == dict:
206                println(
207                    """
208    if ({}) {{
209      {}.Set(mozilla::MakeStringSpan("{}"));
210      return true;
211    }}
212    return false;
213""".format(
214                        equals(subtags[0]), target_name, mappings[subtags[0]]
215                    ).strip(
216                        "\n"
217                    )
218                )
219            else:
220                println(
221                    """
222    return {};
223""".format(
224                        equals(subtags[0])
225                    ).strip(
226                        "\n"
227                    )
228                )
229        elif len(subtags) <= 4:
230            if type(mappings) == dict:
231                for subtag in subtags:
232                    println(
233                        """
234    if ({}) {{
235      {}.Set("{}");
236      return true;
237    }}
238""".format(
239                            equals(subtag), target_name, mappings[subtag]
240                        ).strip(
241                            "\n"
242                        )
243                    )
244
245                println(
246                    """
247    return false;
248""".strip(
249                        "\n"
250                    )
251                )
252            else:
253                cond = (equals(subtag) for subtag in subtags)
254                cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
255                println(
256                    """
257    return {};
258""".format(
259                        cond
260                    ).strip(
261                        "\n"
262                    )
263                )
264        else:
265            write_array(subtags, source_name + "s", length, True)
266
267            if type(mappings) == dict:
268                write_array([mappings[k] for k in subtags], "aliases", length, False)
269
270                println(
271                    """
272    if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
273      {1}.Set(mozilla::MakeStringSpan(replacement));
274      return true;
275    }}
276    return false;
277""".format(
278                        source_name, target_name
279                    ).rstrip()
280                )
281            else:
282                println(
283                    """
284    return HasReplacement({0}s, {0});
285""".format(
286                        source_name
287                    ).rstrip()
288                )
289
290        println(
291            """
292  }
293""".strip(
294                "\n"
295            )
296        )
297
298    if trailing_return:
299        println(
300            """
301  return false;"""
302        )
303
304
305def writeComplexLanguageTagMappings(
306    println, complex_language_mappings, description, source, url
307):
308    println("")
309    writeMappingHeader(println, description, source, url)
310    println(
311        """
312void mozilla::intl::Locale::PerformComplexLanguageMappings() {
313  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
314  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
315""".lstrip()
316    )
317
318    # Merge duplicate language entries.
319    language_aliases = {}
320    for (deprecated_language, (language, script, region)) in sorted(
321        complex_language_mappings.items(), key=itemgetter(0)
322    ):
323        key = (language, script, region)
324        if key not in language_aliases:
325            language_aliases[key] = []
326        else:
327            language_aliases[key].append(deprecated_language)
328
329    first_language = True
330    for (deprecated_language, (language, script, region)) in sorted(
331        complex_language_mappings.items(), key=itemgetter(0)
332    ):
333        key = (language, script, region)
334        if deprecated_language in language_aliases[key]:
335            continue
336
337        if_kind = "if" if first_language else "else if"
338        first_language = False
339
340        cond = (
341            'Language().EqualTo("{}")'.format(lang)
342            for lang in [deprecated_language] + language_aliases[key]
343        )
344        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
345
346        println(
347            """
348  {} ({}) {{""".format(
349                if_kind, cond
350            ).strip(
351                "\n"
352            )
353        )
354
355        println(
356            """
357    SetLanguage("{}");""".format(
358                language
359            ).strip(
360                "\n"
361            )
362        )
363
364        if script is not None:
365            println(
366                """
367    if (Script().Missing()) {{
368      SetScript("{}");
369    }}""".format(
370                    script
371                ).strip(
372                    "\n"
373                )
374            )
375        if region is not None:
376            println(
377                """
378    if (Region().Missing()) {{
379      SetRegion("{}");
380    }}""".format(
381                    region
382                ).strip(
383                    "\n"
384                )
385            )
386        println(
387            """
388  }""".strip(
389                "\n"
390            )
391        )
392
393    println(
394        """
395}
396""".strip(
397            "\n"
398        )
399    )
400
401
402def writeComplexRegionTagMappings(
403    println, complex_region_mappings, description, source, url
404):
405    println("")
406    writeMappingHeader(println, description, source, url)
407    println(
408        """
409void mozilla::intl::Locale::PerformComplexRegionMappings() {
410  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
411  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
412  MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span()));
413  MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span()));
414""".lstrip()
415    )
416
417    # |non_default_replacements| is a list and hence not hashable. Convert it
418    # to a string to get a proper hashable value.
419    def hash_key(default, non_default_replacements):
420        return (default, str(sorted(str(v) for v in non_default_replacements)))
421
422    # Merge duplicate region entries.
423    region_aliases = {}
424    for (deprecated_region, (default, non_default_replacements)) in sorted(
425        complex_region_mappings.items(), key=itemgetter(0)
426    ):
427        key = hash_key(default, non_default_replacements)
428        if key not in region_aliases:
429            region_aliases[key] = []
430        else:
431            region_aliases[key].append(deprecated_region)
432
433    first_region = True
434    for (deprecated_region, (default, non_default_replacements)) in sorted(
435        complex_region_mappings.items(), key=itemgetter(0)
436    ):
437        key = hash_key(default, non_default_replacements)
438        if deprecated_region in region_aliases[key]:
439            continue
440
441        if_kind = "if" if first_region else "else if"
442        first_region = False
443
444        cond = (
445            'Region().EqualTo("{}")'.format(region)
446            for region in [deprecated_region] + region_aliases[key]
447        )
448        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
449
450        println(
451            """
452  {} ({}) {{""".format(
453                if_kind, cond
454            ).strip(
455                "\n"
456            )
457        )
458
459        replacement_regions = sorted(
460            {region for (_, _, region) in non_default_replacements}
461        )
462
463        first_case = True
464        for replacement_region in replacement_regions:
465            replacement_language_script = sorted(
466                (language, script)
467                for (language, script, region) in (non_default_replacements)
468                if region == replacement_region
469            )
470
471            if_kind = "if" if first_case else "else if"
472            first_case = False
473
474            def compare_tags(language, script):
475                if script is None:
476                    return 'Language().EqualTo("{}")'.format(language)
477                return '(Language().EqualTo("{}") && Script().EqualTo("{}"))'.format(
478                    language, script
479                )
480
481            cond = (
482                compare_tags(language, script)
483                for (language, script) in replacement_language_script
484            )
485            cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)
486
487            println(
488                """
489    {} ({}) {{
490      SetRegion("{}");
491    }}""".format(
492                    if_kind, cond, replacement_region
493                )
494                .rstrip()
495                .strip("\n")
496            )
497
498        println(
499            """
500    else {{
501      SetRegion("{}");
502    }}
503  }}""".format(
504                default
505            )
506            .rstrip()
507            .strip("\n")
508        )
509
510    println(
511        """
512}
513""".strip(
514            "\n"
515        )
516    )
517
518
519def writeVariantTagMappings(println, variant_mappings, description, source, url):
520    """Writes a function definition that maps variant subtags."""
521    println(
522        """
523static const char* ToCharPointer(const char* str) {
524  return str;
525}
526
527static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) {
528  return str.get();
529}
530
531template <typename T, typename U = T>
532static bool IsLessThan(const T& a, const U& b) {
533  return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
534}
535"""
536    )
537    writeMappingHeader(println, description, source, url)
538    println(
539        """
540bool mozilla::intl::Locale::PerformVariantMappings() {
541  // The variant subtags need to be sorted for binary search.
542  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
543                            IsLessThan<decltype(mVariants)::ElementType>));
544
545  auto removeVariantAt = [&](size_t index) {
546    mVariants.erase(mVariants.begin() + index);
547  };
548
549  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
550    auto* p = std::lower_bound(
551        mVariants.begin(), mVariants.end(), variant,
552        IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>);
553
554    // Don't insert the replacement when already present.
555    if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
556      return true;
557    }
558
559    // Insert the preferred variant in sort order.
560    auto preferred = DuplicateStringToUniqueChars(variant);
561    return !!mVariants.insert(p, std::move(preferred));
562  };
563
564  for (size_t i = 0; i < mVariants.length();) {
565    const char* variant = mVariants[i].get();
566    MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant)));
567""".lstrip()
568    )
569
570    (no_alias, with_alias) = partition(
571        variant_mappings.items(), lambda item: item[1] is None
572    )
573
574    no_replacements = " ||\n        ".join(
575        f"""strcmp(variant, "{deprecated_variant}") == 0"""
576        for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0))
577    )
578
579    println(
580        f"""
581    if ({no_replacements}) {{
582      removeVariantAt(i);
583    }}
584""".strip(
585            "\n"
586        )
587    )
588
589    for (deprecated_variant, (type, replacement)) in sorted(
590        with_alias, key=itemgetter(0)
591    ):
592        println(
593            f"""
594    else if (strcmp(variant, "{deprecated_variant}") == 0) {{
595      removeVariantAt(i);
596""".strip(
597                "\n"
598            )
599        )
600
601        if type == "language":
602            println(
603                f"""
604      SetLanguage("{replacement}");
605""".strip(
606                    "\n"
607                )
608            )
609        elif type == "region":
610            println(
611                f"""
612      SetRegion("{replacement}");
613""".strip(
614                    "\n"
615                )
616            )
617        else:
618            assert type == "variant"
619            println(
620                f"""
621      if (!insertVariantSortedIfNotPresent("{replacement}")) {{
622        return false;
623      }}
624""".strip(
625                    "\n"
626                )
627            )
628
629        println(
630            """
631    }
632""".strip(
633                "\n"
634            )
635        )
636
637    println(
638        """
639    else {
640      i++;
641    }
642  }
643  return true;
644}
645""".strip(
646            "\n"
647        )
648    )
649
650
651def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url):
652    """Writes a function definition that maps legacy language tags."""
653    println("")
654    writeMappingHeader(println, description, source, url)
655    println(
656        """\
657bool mozilla::intl::Locale::UpdateLegacyMappings() {
658  // We're mapping legacy tags to non-legacy form here.
659  // Other tags remain unchanged.
660  //
661  // Legacy tags are either sign language tags ("sgn") or have one or multiple
662  // variant subtags. Therefore we can quickly exclude most tags by checking
663  // these two subtags.
664
665  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
666
667  if (!Language().EqualTo("sgn") && mVariants.length() == 0) {
668    return true;
669  }
670
671#ifdef DEBUG
672  for (const auto& variant : Variants()) {
673    MOZ_ASSERT(IsStructurallyValidVariantTag(variant));
674    MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant));
675  }
676#endif
677
678  // The variant subtags need to be sorted for binary search.
679  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
680                            IsLessThan<decltype(mVariants)::ElementType>));
681
682  auto findVariant = [this](const char* variant) {
683    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
684                               IsLessThan<decltype(mVariants)::ElementType,
685                                          decltype(variant)>);
686
687    if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
688      return p;
689    }
690    return static_cast<decltype(p)>(nullptr);
691  };
692
693  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
694    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
695                               IsLessThan<decltype(mVariants)::ElementType,
696                                          decltype(variant)>);
697
698    // Don't insert the replacement when already present.
699    if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
700      return true;
701    }
702
703    // Insert the preferred variant in sort order.
704    auto preferred = DuplicateStringToUniqueChars(variant);
705    return !!mVariants.insert(p, std::move(preferred));
706  };
707
708  auto removeVariant = [&](auto* p) {
709    size_t index = std::distance(mVariants.begin(), p);
710    mVariants.erase(mVariants.begin() + index);
711  };
712
713  auto removeVariants = [&](auto* p, auto* q) {
714    size_t pIndex = std::distance(mVariants.begin(), p);
715    size_t qIndex = std::distance(mVariants.begin(), q);
716    MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted");
717
718    mVariants.erase(mVariants.begin() + qIndex);
719    mVariants.erase(mVariants.begin() + pIndex);
720  };"""
721    )
722
723    # Helper class for pattern matching.
724    class AnyClass:
725        def __eq__(self, obj):
726            return obj is not None
727
728    Any = AnyClass()
729
730    # Group the mappings by language.
731    legacy_mappings_by_language = {}
732    for (type, replacement) in legacy_mappings.items():
733        (language, _, _, _) = type
734        legacy_mappings_by_language.setdefault(language, {})[type] = replacement
735
736    # Handle the empty language case first.
737    if None in legacy_mappings_by_language:
738        # Get the mappings and remove them from the dict.
739        mappings = legacy_mappings_by_language.pop(None)
740
741        # This case only applies for the "hepburn-heploc" -> "alalc97"
742        # mapping, so just inline it here.
743        from_tag = (None, None, None, "hepburn-heploc")
744        to_tag = (None, None, None, "alalc97")
745
746        assert len(mappings) == 1
747        assert mappings[from_tag] == to_tag
748
749        println(
750            """
751  if (mVariants.length() >= 2) {
752    if (auto* hepburn = findVariant("hepburn")) {
753      if (auto* heploc = findVariant("heploc")) {
754        removeVariants(hepburn, heploc);
755
756        if (!insertVariantSortedIfNotPresent("alalc97")) {
757          return false;
758        }
759      }
760    }
761  }
762"""
763        )
764
765    # Handle sign languages next.
766    if "sgn" in legacy_mappings_by_language:
767        mappings = legacy_mappings_by_language.pop("sgn")
768
769        # Legacy sign language mappings have the form "sgn-XX" where "XX" is
770        # some region code.
771        assert all(type == ("sgn", None, Any, None) for type in mappings.keys())
772
773        # Legacy sign languages are mapped to a single language subtag.
774        assert all(
775            replacement == (Any, None, None, None) for replacement in mappings.values()
776        )
777
778        println(
779            """
780  if (Language().EqualTo("sgn")) {
781    if (Region().Present() && SignLanguageMapping(mLanguage, Region())) {
782      mRegion.Set(mozilla::MakeStringSpan(""));
783    }
784  }
785""".rstrip().lstrip(
786                "\n"
787            )
788        )
789
790    # Finally handle all remaining cases.
791
792    # The remaining mappings have neither script nor region subtags in the source locale.
793    assert all(
794        type == (Any, None, None, Any)
795        for mappings in legacy_mappings_by_language.values()
796        for type in mappings.keys()
797    )
798
799    # And they have neither script nor region nor variant subtags in the target locale.
800    assert all(
801        replacement == (Any, None, None, None)
802        for mappings in legacy_mappings_by_language.values()
803        for replacement in mappings.values()
804    )
805
806    # Compact the mappings table by removing empty fields.
807    legacy_mappings_by_language = {
808        lang: {
809            variants: r_language
810            for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items()
811        }
812        for (lang, mappings) in legacy_mappings_by_language.items()
813    }
814
815    # Try to combine the remaining cases.
816    legacy_mappings_compact = {}
817
818    # Python can't hash dicts or lists, so use the string representation as the hash key.
819    def hash_key(mappings):
820        return str(sorted(mappings.items(), key=itemgetter(0)))
821
822    for (lang, mappings) in sorted(
823        legacy_mappings_by_language.items(), key=itemgetter(0)
824    ):
825        key = hash_key(mappings)
826        legacy_mappings_compact.setdefault(key, []).append(lang)
827
828    for langs in legacy_mappings_compact.values():
829        language_equal_to = (
830            f"""Language().EqualTo("{lang}")""" for lang in sorted(langs)
831        )
832        cond = f""" ||\n{" " * len("  else if (")}""".join(language_equal_to)
833
834        println(
835            f"""
836  else if ({cond}) {{
837""".rstrip().lstrip(
838                "\n"
839            )
840        )
841
842        mappings = legacy_mappings_by_language[langs[0]]
843
844        # Count the variant subtags to determine the sort order.
845        def variant_size(m):
846            (k, _) = m
847            return len(k.split("-"))
848
849        # Alias rules are applied by largest union size first.
850        for (size, mappings_by_size) in groupby(
851            sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size
852        ):
853
854            # Convert grouper object to dict.
855            mappings_by_size = dict(mappings_by_size)
856
857            is_first = True
858            chain_if = size == 1
859
860            # Alias rules are applied in alphabetical order
861            for (variants, r_language) in sorted(
862                mappings_by_size.items(), key=itemgetter(0)
863            ):
864                sorted_variants = sorted(variants.split("-"))
865                len_variants = len(sorted_variants)
866
867                maybe_else = "else " if chain_if and not is_first else ""
868                is_first = False
869
870                for (i, variant) in enumerate(sorted_variants):
871                    println(
872                        f"""
873    {"  " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{
874""".rstrip().lstrip(
875                            "\n"
876                        )
877                    )
878
879                indent = "  " * len_variants
880
881                println(
882                    f"""
883    {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)});
884    {indent}SetLanguage("{r_language}");
885    {indent}{"return true;" if not chain_if else ""}
886""".rstrip().lstrip(
887                        "\n"
888                    )
889                )
890
891                for i in range(len_variants, 0, -1):
892                    println(
893                        f"""
894    {"  " * (i - 1)}}}
895""".rstrip().lstrip(
896                            "\n"
897                        )
898                    )
899
900        println(
901            """
902  }
903""".rstrip().lstrip(
904                "\n"
905            )
906        )
907
908    println(
909        """
910  return true;
911}"""
912    )
913
914
915def writeSignLanguageMappingsFunction(
916    println, legacy_mappings, description, source, url
917):
918    """Writes a function definition that maps legacy sign language tags."""
919    println("")
920    writeMappingHeader(println, description, source, url)
921    println(
922        """\
923bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language,
924                                                const RegionSubtag& region) {
925  MOZ_ASSERT(language.EqualTo("sgn"));
926  MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
927  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
928""".rstrip()
929    )
930
931    region_mappings = {
932        rg: lg
933        for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items()
934        if lang == "sgn"
935    }
936
937    source_name = "region"
938    target_name = "language"
939    tag_maxlength = 3
940    writeMappingsBinarySearchBody(
941        println, source_name, target_name, region_mappings, tag_maxlength
942    )
943
944    println(
945        """
946}""".lstrip()
947    )
948
949
950def readSupplementalData(core_file):
951    """Reads CLDR Supplemental Data and extracts information for Intl.js.
952
953    Information extracted:
954    - legacyMappings: mappings from legacy tags to preferred complete language tags
955    - languageMappings: mappings from language subtags to preferred subtags
956    - complexLanguageMappings: mappings from language subtags with complex rules
957    - regionMappings: mappings from region subtags to preferred subtags
958    - complexRegionMappings: mappings from region subtags with complex rules
959    - variantMappings: mappings from variant subtags to preferred subtags
960    - likelySubtags: likely subtags used for generating test data only
961    Returns these mappings as dictionaries.
962    """
963    import xml.etree.ElementTree as ET
964
965    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
966    re_unicode_language_id = re.compile(
967        r"""
968        ^
969        # unicode_language_id = unicode_language_subtag
970        #     unicode_language_subtag = alpha{2,3} | alpha{5,8}
971        (?P<language>[a-z]{2,3}|[a-z]{5,8})
972
973        # (sep unicode_script_subtag)?
974        #     unicode_script_subtag = alpha{4}
975        (?:-(?P<script>[a-z]{4}))?
976
977        # (sep unicode_region_subtag)?
978        #     unicode_region_subtag = (alpha{2} | digit{3})
979        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
980
981        # (sep unicode_variant_subtag)*
982        #     unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
983        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
984        $
985        """,
986        re.IGNORECASE | re.VERBOSE,
987    )
988
989    # CLDR uses "_" as the separator for some elements. Replace it with "-".
990    def bcp47_id(cldr_id):
991        return cldr_id.replace("_", "-")
992
993    # Return the tuple (language, script, region, variants) and assert all
994    # subtags are in canonical case.
995    def bcp47_canonical(language, script, region, variants):
996        # Canonical case for language subtags is lower case.
997        assert language is None or language.lower() == language
998
999        # Canonical case for script subtags is title case.
1000        assert script is None or script.title() == script
1001
1002        # Canonical case for region subtags is upper case.
1003        assert region is None or region.upper() == region
1004
1005        # Canonical case for variant subtags is lower case.
1006        assert variants is None or variants.lower() == variants
1007
1008        return (language, script, region, variants[1:] if variants else None)
1009
1010    # Language ids are interpreted as multi-maps in
1011    # <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>.
1012    #
1013    # See UTS35, §Annex C, Definitions - 1. Multimap interpretation.
1014    def language_id_to_multimap(language_id):
1015        match = re_unicode_language_id.match(language_id)
1016        assert (
1017            match is not None
1018        ), f"{language_id} invalid Unicode BCP 47 locale identifier"
1019
1020        canonical_language_id = bcp47_canonical(
1021            *match.group("language", "script", "region", "variants")
1022        )
1023        (language, _, _, _) = canonical_language_id
1024
1025        # Normalize "und" language to None, but keep the rest as is.
1026        return (language if language != "und" else None,) + canonical_language_id[1:]
1027
1028    rules = {}
1029    territory_exception_rules = {}
1030
1031    tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
1032
1033    # Load the rules from supplementalMetadata.xml.
1034    #
1035    # See UTS35, §Annex C, Definitions - 2. Alias elements.
1036    # See UTS35, §Annex C, Preprocessing.
1037    for alias_name in [
1038        "languageAlias",
1039        "scriptAlias",
1040        "territoryAlias",
1041        "variantAlias",
1042    ]:
1043        for alias in tree.iterfind(".//" + alias_name):
1044            # Replace '_' by '-'.
1045            type = bcp47_id(alias.get("type"))
1046            replacement = bcp47_id(alias.get("replacement"))
1047
1048            # Prefix with "und-".
1049            if alias_name != "languageAlias":
1050                type = "und-" + type
1051
1052            # Discard all rules where the type is an invalid languageId.
1053            if re_unicode_language_id.match(type) is None:
1054                continue
1055
1056            type = language_id_to_multimap(type)
1057
1058            # Multiple, whitespace-separated territory replacements may be present.
1059            if alias_name == "territoryAlias" and " " in replacement:
1060                replacements = replacement.split(" ")
1061                replacement_list = [
1062                    language_id_to_multimap("und-" + r) for r in replacements
1063                ]
1064
1065                assert (
1066                    type not in territory_exception_rules
1067                ), f"Duplicate alias rule: {type}"
1068
1069                territory_exception_rules[type] = replacement_list
1070
1071                # The first element is the default territory replacement.
1072                replacement = replacements[0]
1073
1074            # Prefix with "und-".
1075            if alias_name != "languageAlias":
1076                replacement = "und-" + replacement
1077
1078            replacement = language_id_to_multimap(replacement)
1079
1080            assert type not in rules, f"Duplicate alias rule: {type}"
1081
1082            rules[type] = replacement
1083
1084    # Helper class for pattern matching.
1085    class AnyClass:
1086        def __eq__(self, obj):
1087            return obj is not None
1088
1089    Any = AnyClass()
1090
1091    modified_rules = True
1092    loop_count = 0
1093
1094    while modified_rules:
1095        modified_rules = False
1096        loop_count += 1
1097
1098        # UTS 35 defines that canonicalization is applied until a fixed point has
1099        # been reached. This iterative application of the canonicalization algorithm
1100        # is only needed for a relatively small set of rules, so we can precompute
1101        # the transitive closure of all rules here and then perform a single pass
1102        # when canonicalizing language tags at runtime.
1103        transitive_rules = {}
1104
1105        # Compute the transitive closure.
1106        # Any case which currently doesn't occur in the CLDR sources isn't supported
1107        # and will lead to throwing an error.
1108        for (type, replacement) in rules.items():
1109            (language, script, region, variants) = type
1110            (r_language, r_script, r_region, r_variants) = replacement
1111
1112            for (i_type, i_replacement) in rules.items():
1113                (i_language, i_script, i_region, i_variants) = i_type
1114                (i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement
1115
1116                if i_language is not None and i_language == r_language:
1117                    # This case currently only occurs when neither script nor region
1118                    # subtags are present. A single variant subtags may be present
1119                    # in |type|. And |i_type| definitely has a single variant subtag.
1120                    # Should this ever change, update this code accordingly.
1121                    assert type == (Any, None, None, None) or type == (
1122                        Any,
1123                        None,
1124                        None,
1125                        Any,
1126                    )
1127                    assert replacement == (Any, None, None, None)
1128                    assert i_type == (Any, None, None, Any)
1129                    assert i_replacement == (Any, None, None, None)
1130
1131                    # This case happens for the rules
1132                    #   "zh-guoyu -> zh",
1133                    #   "zh-hakka -> hak", and
1134                    #   "und-hakka -> und".
1135                    # Given the possible input "zh-guoyu-hakka", the first rule will
1136                    # change it to "zh-hakka", and then the second rule can be
1137                    # applied. (The third rule isn't applied ever.)
1138                    #
1139                    # Let's assume there's a hypothetical rule
1140                    #   "zh-aaaaa" -> "en"
1141                    # And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en"
1142                    # is applied before "zh-hakka -> hak", because rules are sorted
1143                    # alphabetically. That means the overall result is "en":
1144                    # "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then
1145                    # "hakka" is removed through the third rule.
1146                    #
1147                    # No current rule requires to handle this special case, so we
1148                    # don't yet support it.
1149                    assert variants is None or variants <= i_variants
1150
1151                    # Combine all variants and remove duplicates.
1152                    vars = set(
1153                        i_variants.split("-")
1154                        + (variants.split("-") if variants else [])
1155                    )
1156
1157                    # Add the variants alphabetically sorted.
1158                    n_type = (language, None, None, "-".join(sorted(vars)))
1159
1160                    assert (
1161                        n_type not in transitive_rules
1162                        or transitive_rules[n_type] == i_replacement
1163                    )
1164                    transitive_rules[n_type] = i_replacement
1165
1166                    continue
1167
1168                if i_script is not None and i_script == r_script:
1169                    # This case currently doesn't occur, so we don't yet support it.
1170                    raise ValueError(
1171                        f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
1172                    )
1173                if i_region is not None and i_region == r_region:
1174                    # This case currently only applies for sign language
1175                    # replacements. Similar to the language subtag case any other
1176                    # combination isn't currently supported.
1177                    assert type == (None, None, Any, None)
1178                    assert replacement == (None, None, Any, None)
1179                    assert i_type == ("sgn", None, Any, None)
1180                    assert i_replacement == (Any, None, None, None)
1181
1182                    n_type = ("sgn", None, region, None)
1183
1184                    assert n_type not in transitive_rules
1185                    transitive_rules[n_type] = i_replacement
1186
1187                    continue
1188
1189                if i_variants is not None and i_variants == r_variants:
1190                    # This case currently doesn't occur, so we don't yet support it.
1191                    raise ValueError(
1192                        f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
1193                    )
1194
1195        # Ensure there are no contradicting rules.
1196        assert all(
1197            rules[type] == replacement
1198            for (type, replacement) in transitive_rules.items()
1199            if type in rules
1200        )
1201
1202        # If |transitive_rules| is not a subset of |rules|, new rules will be added.
1203        modified_rules = not (transitive_rules.keys() <= rules.keys())
1204
1205        # Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}"
1206        # case. Failing this assertion means either there's a bug when computing the
1207        # stop condition of this loop or a new kind of legacy language tags was added.
1208        if modified_rules and loop_count > 1:
1209            new_rules = {k for k in transitive_rules.keys() if k not in rules}
1210            for k in new_rules:
1211                assert k == (Any, None, None, "guoyu-hakka") or k == (
1212                    Any,
1213                    None,
1214                    None,
1215                    "guoyu-xiang",
1216                )
1217
1218        # Merge the transitive rules.
1219        rules.update(transitive_rules)
1220
1221    # Computes the size of the union of all field value sets.
1222    def multi_map_size(locale_id):
1223        (language, script, region, variants) = locale_id
1224
1225        return (
1226            (1 if language is not None else 0)
1227            + (1 if script is not None else 0)
1228            + (1 if region is not None else 0)
1229            + (len(variants.split("-")) if variants is not None else 0)
1230        )
1231
1232    # Dictionary of legacy mappings, contains raw rules, e.g.
1233    # (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97").
1234    legacy_mappings = {}
1235
1236    # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
1237    language_mappings = {}
1238
1239    # Dictionary of complex language subtag mappings, modifying more than one
1240    # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
1241    complex_language_mappings = {}
1242
1243    # Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh".
1244    script_mappings = {}
1245
1246    # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
1247    region_mappings = {}
1248
1249    # Dictionary of complex region subtag mappings, containing more than one
1250    # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
1251    complex_region_mappings = {}
1252
1253    # Dictionary of aliased variant subtags to a tuple of preferred replacement
1254    # type and replacement, e.g. "arevela" -> ("language", "hy") or
1255    # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
1256    variant_mappings = {}
1257
1258    # Preprocess all rules so we can perform a single lookup per subtag at runtime.
1259    for (type, replacement) in rules.items():
1260        (language, script, region, variants) = type
1261        (r_language, r_script, r_region, r_variants) = replacement
1262
1263        type_map_size = multi_map_size(type)
1264
1265        # Most mappings are one-to-one and can be encoded through lookup tables.
1266        if type_map_size == 1:
1267            if language is not None:
1268                assert r_language is not None, "Can't remove a language subtag"
1269
1270                # We don't yet support this case.
1271                assert (
1272                    r_variants is None
1273                ), f"Unhandled variant replacement in language alias: {replacement}"
1274
1275                if replacement == (Any, None, None, None):
1276                    language_mappings[language] = r_language
1277                else:
1278                    complex_language_mappings[language] = replacement[:-1]
1279            elif script is not None:
1280                # We don't support removing script subtags.
1281                assert (
1282                    r_script is not None
1283                ), f"Can't remove a script subtag: {replacement}"
1284
1285                # We only support one-to-one script mappings for now.
1286                assert replacement == (
1287                    None,
1288                    Any,
1289                    None,
1290                    None,
1291                ), f"Unhandled replacement in script alias: {replacement}"
1292
1293                script_mappings[script] = r_script
1294            elif region is not None:
1295                # We don't support removing region subtags.
1296                assert (
1297                    r_region is not None
1298                ), f"Can't remove a region subtag: {replacement}"
1299
1300                # We only support one-to-one region mappings for now.
1301                assert replacement == (
1302                    None,
1303                    None,
1304                    Any,
1305                    None,
1306                ), f"Unhandled replacement in region alias: {replacement}"
1307
1308                if type not in territory_exception_rules:
1309                    region_mappings[region] = r_region
1310                else:
1311                    complex_region_mappings[region] = [
1312                        r_region
1313                        for (_, _, r_region, _) in territory_exception_rules[type]
1314                    ]
1315            else:
1316                assert variants is not None
1317                assert len(variants.split("-")) == 1
1318
1319                # We only support one-to-one variant mappings for now.
1320                assert (
1321                    multi_map_size(replacement) <= 1
1322                ), f"Unhandled replacement in variant alias: {replacement}"
1323
1324                if r_language is not None:
1325                    variant_mappings[variants] = ("language", r_language)
1326                elif r_script is not None:
1327                    variant_mappings[variants] = ("script", r_script)
1328                elif r_region is not None:
1329                    variant_mappings[variants] = ("region", r_region)
1330                elif r_variants is not None:
1331                    assert len(r_variants.split("-")) == 1
1332                    variant_mappings[variants] = ("variant", r_variants)
1333                else:
1334                    variant_mappings[variants] = None
1335        else:
1336            # Alias rules which have multiple input fields must be processed
1337            # first. This applies only to a handful of rules, so our generated
1338            # code adds fast paths to skip these rules in the common case.
1339
1340            # Case 1: Language and at least one variant subtag.
1341            if language is not None and variants is not None:
1342                pass
1343
1344            # Case 2: Sign language and a region subtag.
1345            elif language == "sgn" and region is not None:
1346                pass
1347
1348            # Case 3: "hepburn-heploc" to "alalc97" canonicalization.
1349            elif (
1350                language is None
1351                and variants is not None
1352                and len(variants.split("-")) == 2
1353            ):
1354                pass
1355
1356            # Any other combination is currently unsupported.
1357            else:
1358                raise ValueError(f"{type} -> {replacement}")
1359
1360            legacy_mappings[type] = replacement
1361
1362    tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
1363
1364    likely_subtags = {}
1365
1366    for likely_subtag in tree.iterfind(".//likelySubtag"):
1367        from_tag = bcp47_id(likely_subtag.get("from"))
1368        from_match = re_unicode_language_id.match(from_tag)
1369        assert (
1370            from_match is not None
1371        ), f"{from_tag} invalid Unicode BCP 47 locale identifier"
1372        assert (
1373            from_match.group("variants") is None
1374        ), f"unexpected variant subtags in {from_tag}"
1375
1376        to_tag = bcp47_id(likely_subtag.get("to"))
1377        to_match = re_unicode_language_id.match(to_tag)
1378        assert (
1379            to_match is not None
1380        ), f"{to_tag} invalid Unicode BCP 47 locale identifier"
1381        assert (
1382            to_match.group("variants") is None
1383        ), f"unexpected variant subtags in {to_tag}"
1384
1385        from_canonical = bcp47_canonical(
1386            *from_match.group("language", "script", "region", "variants")
1387        )
1388
1389        to_canonical = bcp47_canonical(
1390            *to_match.group("language", "script", "region", "variants")
1391        )
1392
1393        # Remove the empty variant subtags.
1394        from_canonical = from_canonical[:-1]
1395        to_canonical = to_canonical[:-1]
1396
1397        likely_subtags[from_canonical] = to_canonical
1398
1399    complex_region_mappings_final = {}
1400
1401    for (deprecated_region, replacements) in complex_region_mappings.items():
1402        # Find all likely subtag entries which don't already contain a region
1403        # subtag and whose target region is in the list of replacement regions.
1404        region_likely_subtags = [
1405            (from_language, from_script, to_region)
1406            for (
1407                (from_language, from_script, from_region),
1408                (_, _, to_region),
1409            ) in likely_subtags.items()
1410            if from_region is None and to_region in replacements
1411        ]
1412
1413        # The first replacement entry is the default region.
1414        default = replacements[0]
1415
1416        # Find all likely subtag entries whose region matches the default region.
1417        default_replacements = {
1418            (language, script)
1419            for (language, script, region) in region_likely_subtags
1420            if region == default
1421        }
1422
1423        # And finally find those entries which don't use the default region.
1424        # These are the entries we're actually interested in, because those need
1425        # to be handled specially when selecting the correct preferred region.
1426        non_default_replacements = [
1427            (language, script, region)
1428            for (language, script, region) in region_likely_subtags
1429            if (language, script) not in default_replacements
1430        ]
1431
1432        # If there are no non-default replacements, we can handle the region as
1433        # part of the simple region mapping.
1434        if non_default_replacements:
1435            complex_region_mappings_final[deprecated_region] = (
1436                default,
1437                non_default_replacements,
1438            )
1439        else:
1440            region_mappings[deprecated_region] = default
1441
1442    return {
1443        "legacyMappings": legacy_mappings,
1444        "languageMappings": language_mappings,
1445        "complexLanguageMappings": complex_language_mappings,
1446        "scriptMappings": script_mappings,
1447        "regionMappings": region_mappings,
1448        "complexRegionMappings": complex_region_mappings_final,
1449        "variantMappings": variant_mappings,
1450        "likelySubtags": likely_subtags,
1451    }
1452
1453
1454def readUnicodeExtensions(core_file):
1455    import xml.etree.ElementTree as ET
1456
1457    # Match all xml-files in the BCP 47 directory.
1458    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
1459
1460    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
1461    #
1462    # type = alphanum{3,8} (sep alphanum{3,8})* ;
1463    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
1464
1465    # https://www.unicode.org/reports/tr35/#Unicode_language_identifier
1466    #
1467    # unicode_region_subtag = alpha{2} ;
1468    alphaRegionRE = re.compile(r"^[A-Z]{2}$", re.IGNORECASE)
1469
1470    # Mapping from Unicode extension types to dict of deprecated to
1471    # preferred values.
1472    mapping = {
1473        # Unicode BCP 47 U Extension
1474        "u": {},
1475        # Unicode BCP 47 T Extension
1476        "t": {},
1477    }
1478
1479    def readBCP47File(file):
1480        tree = ET.parse(file)
1481        for keyword in tree.iterfind(".//keyword/key"):
1482            extension = keyword.get("extension", "u")
1483            assert (
1484                extension == "u" or extension == "t"
1485            ), "unknown extension type: {}".format(extension)
1486
1487            extension_name = keyword.get("name")
1488
1489            for type in keyword.iterfind("type"):
1490                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
1491                #
1492                # The key or type name used by Unicode locale extension with 'u' extension
1493                # syntax or the 't' extensions syntax. When alias below is absent, this name
1494                # can be also used with the old style "@key=type" syntax.
1495                name = type.get("name")
1496
1497                # Ignore the special name:
1498                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
1499                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
1500                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
1501                # - <https://unicode.org/reports/tr35/#SCRIPT_CODE>
1502                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
1503                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
1504                if name in (
1505                    "CODEPOINTS",
1506                    "REORDER_CODE",
1507                    "RG_KEY_VALUE",
1508                    "SCRIPT_CODE",
1509                    "SUBDIVISION_CODE",
1510                    "PRIVATE_USE",
1511                ):
1512                    continue
1513
1514                # All other names should match the 'type' production.
1515                assert (
1516                    typeRE.match(name) is not None
1517                ), "{} matches the 'type' production".format(name)
1518
1519                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
1520                #
1521                # The preferred value of the deprecated key, type or attribute element.
1522                # When a key, type or attribute element is deprecated, this attribute is
1523                # used for specifying a new canonical form if available.
1524                preferred = type.get("preferred")
1525
1526                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
1527                #
1528                # The BCP 47 form is the canonical form, and recommended. Other aliases are
1529                # included only for backwards compatibility.
1530                alias = type.get("alias")
1531
1532                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
1533                #
1534                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
1535                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
1536                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
1537                # value, while the canonical is in the name attribute value.
1538
1539                # 'preferred' contains the new preferred name, 'alias' the compatibility
1540                # name, but then there's this entry where 'preferred' and 'alias' are the
1541                # same. So which one to choose? Assume 'preferred' is the actual canonical
1542                # name.
1543                #
1544                # <type name="islamicc"
1545                #       description="Civil (algorithmic) Arabic calendar"
1546                #       deprecated="true"
1547                #       preferred="islamic-civil"
1548                #       alias="islamic-civil"/>
1549
1550                if preferred is not None:
1551                    assert typeRE.match(preferred), preferred
1552                    mapping[extension].setdefault(extension_name, {})[name] = preferred
1553
1554                if alias is not None:
1555                    for alias_name in alias.lower().split(" "):
1556                        # Ignore alias entries which don't match the 'type' production.
1557                        if typeRE.match(alias_name) is None:
1558                            continue
1559
1560                        # See comment above when 'alias' and 'preferred' are both present.
1561                        if (
1562                            preferred is not None
1563                            and name in mapping[extension][extension_name]
1564                        ):
1565                            continue
1566
1567                        # Skip over entries where 'name' and 'alias' are equal.
1568                        #
1569                        # <type name="pst8pdt"
1570                        #       description="POSIX style time zone for US Pacific Time"
1571                        #       alias="PST8PDT"
1572                        #       since="1.8"/>
1573                        if name == alias_name:
1574                            continue
1575
1576                        mapping[extension].setdefault(extension_name, {})[
1577                            alias_name
1578                        ] = name
1579
1580    def readSupplementalMetadata(file):
1581        # Find subdivision and region replacements.
1582        #
1583        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
1584        #
1585        # Replace aliases in special key values:
1586        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
1587        #     in its value in the same way, using subdivisionAlias data.
1588        tree = ET.parse(file)
1589        for alias in tree.iterfind(".//subdivisionAlias"):
1590            type = alias.get("type")
1591            assert (
1592                typeRE.match(type) is not None
1593            ), "{} matches the 'type' production".format(type)
1594
1595            # Take the first replacement when multiple ones are present.
1596            replacement = alias.get("replacement").split(" ")[0].lower()
1597
1598            # Append "zzzz" if the replacement is a two-letter region code.
1599            if alphaRegionRE.match(replacement) is not None:
1600                replacement += "zzzz"
1601
1602            # Assert the replacement is syntactically correct.
1603            assert (
1604                typeRE.match(replacement) is not None
1605            ), "replacement {} matches the 'type' production".format(replacement)
1606
1607            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
1608            mapping["u"].setdefault("rg", {})[type] = replacement
1609            mapping["u"].setdefault("sd", {})[type] = replacement
1610
1611    for name in core_file.namelist():
1612        if bcpFileRE.match(name):
1613            readBCP47File(core_file.open(name))
1614
1615    readSupplementalMetadata(
1616        core_file.open("common/supplemental/supplementalMetadata.xml")
1617    )
1618
1619    return {
1620        "unicodeMappings": mapping["u"],
1621        "transformMappings": mapping["t"],
1622    }
1623
1624
1625def writeCLDRLanguageTagData(println, data, url):
1626    """Writes the language tag data to the Intl data file."""
1627
1628    println(generatedFileWarning)
1629    println("// Version: CLDR-{}".format(data["version"]))
1630    println("// URL: {}".format(url))
1631
1632    println(
1633        """
1634#include "mozilla/Assertions.h"
1635#include "mozilla/Span.h"
1636#include "mozilla/TextUtils.h"
1637
1638#include <algorithm>
1639#include <cstdint>
1640#include <cstring>
1641#include <iterator>
1642#include <string>
1643#include <type_traits>
1644
1645#include "mozilla/intl/Locale.h"
1646
1647using namespace mozilla::intl::LanguageTagLimits;
1648
1649template <size_t Length, size_t TagLength, size_t SubtagLength>
1650static inline bool HasReplacement(
1651    const char (&subtags)[Length][TagLength],
1652    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
1653  MOZ_ASSERT(subtag.Length() == TagLength - 1,
1654             "subtag must have the same length as the list of subtags");
1655
1656  const char* ptr = subtag.Span().data();
1657  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
1658                            [](const char* a, const char* b) {
1659                              return memcmp(a, b, TagLength - 1) < 0;
1660                            });
1661}
1662
1663template <size_t Length, size_t TagLength, size_t SubtagLength>
1664static inline const char* SearchReplacement(
1665    const char (&subtags)[Length][TagLength], const char* (&aliases)[Length],
1666    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
1667  MOZ_ASSERT(subtag.Length() == TagLength - 1,
1668             "subtag must have the same length as the list of subtags");
1669
1670  const char* ptr = subtag.Span().data();
1671  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
1672                            [](const char* a, const char* b) {
1673                              return memcmp(a, b, TagLength - 1) < 0;
1674                            });
1675  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
1676    return aliases[std::distance(std::begin(subtags), p)];
1677  }
1678  return nullptr;
1679}
1680
1681#ifdef DEBUG
1682static bool IsAsciiLowercaseAlphanumeric(char c) {
1683  return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
1684}
1685
1686static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
1687  return IsAsciiLowercaseAlphanumeric(c) || c == '-';
1688}
1689
1690static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
1691  return std::all_of(span.begin(), span.end(),
1692                     mozilla::IsAsciiLowercaseAlpha<char>);
1693}
1694
1695static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
1696  return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
1697         std::all_of(span.begin() + 1, span.end(),
1698                     mozilla::IsAsciiLowercaseAlpha<char>);
1699}
1700
1701static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
1702  return std::all_of(span.begin(), span.end(),
1703                     mozilla::IsAsciiUppercaseAlpha<char>) ||
1704         std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
1705}
1706
1707static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
1708  return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
1709}
1710
1711static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
1712  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
1713}
1714
1715static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
1716  return std::all_of(type.begin(), type.end(),
1717                     IsAsciiLowercaseAlphanumericOrDash);
1718}
1719
1720static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
1721  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
1722}
1723
1724static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
1725  return std::all_of(type.begin(), type.end(),
1726                     IsAsciiLowercaseAlphanumericOrDash);
1727}
1728#endif
1729""".rstrip()
1730    )
1731
1732    source = "CLDR Supplemental Data, version {}".format(data["version"])
1733    legacy_mappings = data["legacyMappings"]
1734    language_mappings = data["languageMappings"]
1735    complex_language_mappings = data["complexLanguageMappings"]
1736    script_mappings = data["scriptMappings"]
1737    region_mappings = data["regionMappings"]
1738    complex_region_mappings = data["complexRegionMappings"]
1739    variant_mappings = data["variantMappings"]
1740    unicode_mappings = data["unicodeMappings"]
1741    transform_mappings = data["transformMappings"]
1742
1743    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
1744    language_maxlength = 8
1745
1746    # unicode_script_subtag = alpha{4} ;
1747    script_maxlength = 4
1748
1749    # unicode_region_subtag = (alpha{2} | digit{3}) ;
1750    region_maxlength = 3
1751
1752    writeMappingsBinarySearch(
1753        println,
1754        "LanguageMapping",
1755        "LanguageSubtag&",
1756        "language",
1757        "IsStructurallyValidLanguageTag",
1758        "IsCanonicallyCasedLanguageTag",
1759        language_mappings,
1760        language_maxlength,
1761        "Mappings from language subtags to preferred values.",
1762        source,
1763        url,
1764    )
1765    writeMappingsBinarySearch(
1766        println,
1767        "ComplexLanguageMapping",
1768        "const LanguageSubtag&",
1769        "language",
1770        "IsStructurallyValidLanguageTag",
1771        "IsCanonicallyCasedLanguageTag",
1772        complex_language_mappings.keys(),
1773        language_maxlength,
1774        "Language subtags with complex mappings.",
1775        source,
1776        url,
1777    )
1778    writeMappingsBinarySearch(
1779        println,
1780        "ScriptMapping",
1781        "ScriptSubtag&",
1782        "script",
1783        "IsStructurallyValidScriptTag",
1784        "IsCanonicallyCasedScriptTag",
1785        script_mappings,
1786        script_maxlength,
1787        "Mappings from script subtags to preferred values.",
1788        source,
1789        url,
1790    )
1791    writeMappingsBinarySearch(
1792        println,
1793        "RegionMapping",
1794        "RegionSubtag&",
1795        "region",
1796        "IsStructurallyValidRegionTag",
1797        "IsCanonicallyCasedRegionTag",
1798        region_mappings,
1799        region_maxlength,
1800        "Mappings from region subtags to preferred values.",
1801        source,
1802        url,
1803    )
1804    writeMappingsBinarySearch(
1805        println,
1806        "ComplexRegionMapping",
1807        "const RegionSubtag&",
1808        "region",
1809        "IsStructurallyValidRegionTag",
1810        "IsCanonicallyCasedRegionTag",
1811        complex_region_mappings.keys(),
1812        region_maxlength,
1813        "Region subtags with complex mappings.",
1814        source,
1815        url,
1816    )
1817
1818    writeComplexLanguageTagMappings(
1819        println,
1820        complex_language_mappings,
1821        "Language subtags with complex mappings.",
1822        source,
1823        url,
1824    )
1825    writeComplexRegionTagMappings(
1826        println,
1827        complex_region_mappings,
1828        "Region subtags with complex mappings.",
1829        source,
1830        url,
1831    )
1832
1833    writeVariantTagMappings(
1834        println,
1835        variant_mappings,
1836        "Mappings from variant subtags to preferred values.",
1837        source,
1838        url,
1839    )
1840
1841    writeLegacyMappingsFunction(
1842        println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url
1843    )
1844
1845    writeSignLanguageMappingsFunction(
1846        println, legacy_mappings, "Mappings from legacy sign languages.", source, url
1847    )
1848
1849    writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
1850    writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
1851
1852
1853def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
1854    """Writes the likely-subtags test file."""
1855
1856    println(generatedFileWarning)
1857
1858    source = "CLDR Supplemental Data, version {}".format(data["version"])
1859    language_mappings = data["languageMappings"]
1860    complex_language_mappings = data["complexLanguageMappings"]
1861    script_mappings = data["scriptMappings"]
1862    region_mappings = data["regionMappings"]
1863    complex_region_mappings = data["complexRegionMappings"]
1864    likely_subtags = data["likelySubtags"]
1865
1866    def bcp47(tag):
1867        (language, script, region) = tag
1868        return "{}{}{}".format(
1869            language, "-" + script if script else "", "-" + region if region else ""
1870        )
1871
1872    def canonical(tag):
1873        (language, script, region) = tag
1874
1875        # Map deprecated language subtags.
1876        if language in language_mappings:
1877            language = language_mappings[language]
1878        elif language in complex_language_mappings:
1879            (language2, script2, region2) = complex_language_mappings[language]
1880            (language, script, region) = (
1881                language2,
1882                script if script else script2,
1883                region if region else region2,
1884            )
1885
1886        # Map deprecated script subtags.
1887        if script in script_mappings:
1888            script = script_mappings[script]
1889
1890        # Map deprecated region subtags.
1891        if region in region_mappings:
1892            region = region_mappings[region]
1893        else:
1894            # Assume no complex region mappings are needed for now.
1895            assert (
1896                region not in complex_region_mappings
1897            ), "unexpected region with complex mappings: {}".format(region)
1898
1899        return (language, script, region)
1900
1901    # https://unicode.org/reports/tr35/#Likely_Subtags
1902
1903    def addLikelySubtags(tag):
1904        # Step 1: Canonicalize.
1905        (language, script, region) = canonical(tag)
1906        if script == "Zzzz":
1907            script = None
1908        if region == "ZZ":
1909            region = None
1910
1911        # Step 2: Lookup.
1912        searches = (
1913            (language, script, region),
1914            (language, None, region),
1915            (language, script, None),
1916            (language, None, None),
1917            ("und", script, None),
1918        )
1919        search = next(search for search in searches if search in likely_subtags)
1920
1921        (language_s, script_s, region_s) = search
1922        (language_m, script_m, region_m) = likely_subtags[search]
1923
1924        # Step 3: Return.
1925        return (
1926            language if language != language_s else language_m,
1927            script if script != script_s else script_m,
1928            region if region != region_s else region_m,
1929        )
1930
1931    # https://unicode.org/reports/tr35/#Likely_Subtags
1932    def removeLikelySubtags(tag):
1933        # Step 1: Add likely subtags.
1934        max = addLikelySubtags(tag)
1935
1936        # Step 2: Remove variants (doesn't apply here).
1937
1938        # Step 3: Find a match.
1939        (language, script, region) = max
1940        for trial in (
1941            (language, None, None),
1942            (language, None, region),
1943            (language, script, None),
1944        ):
1945            if addLikelySubtags(trial) == max:
1946                return trial
1947
1948        # Step 4: Return maximized if no match found.
1949        return max
1950
1951    def likely_canonical(from_tag, to_tag):
1952        # Canonicalize the input tag.
1953        from_tag = canonical(from_tag)
1954
1955        # Update the expected result if necessary.
1956        if from_tag in likely_subtags:
1957            to_tag = likely_subtags[from_tag]
1958
1959        # Canonicalize the expected output.
1960        to_canonical = canonical(to_tag)
1961
1962        # Sanity check: This should match the result of |addLikelySubtags|.
1963        assert to_canonical == addLikelySubtags(from_tag)
1964
1965        return to_canonical
1966
1967    # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
1968    likely_subtags_canonical = {
1969        k: likely_canonical(k, v) for (k, v) in likely_subtags.items()
1970    }
1971
1972    # Add test data for |Intl.Locale.prototype.maximize()|.
1973    writeMappingsVar(
1974        println,
1975        {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
1976        "maxLikelySubtags",
1977        "Extracted from likelySubtags.xml.",
1978        source,
1979        url,
1980    )
1981
1982    # Use the maximalized tags as the input for the remove likely-subtags test.
1983    minimized = {
1984        tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()
1985    }
1986
1987    # Add test data for |Intl.Locale.prototype.minimize()|.
1988    writeMappingsVar(
1989        println,
1990        {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
1991        "minLikelySubtags",
1992        "Extracted from likelySubtags.xml.",
1993        source,
1994        url,
1995    )
1996
1997    println(
1998        """
1999for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
2000    assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
2001}"""
2002    )
2003
2004    println(
2005        """
2006for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
2007    assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
2008}"""
2009    )
2010
2011    println(
2012        """
2013if (typeof reportCompare === "function")
2014    reportCompare(0, 0);"""
2015    )
2016
2017
2018def readCLDRVersionFromICU():
2019    icuDir = os.path.join(topsrcdir, "intl/icu/source")
2020    if not os.path.isdir(icuDir):
2021        raise RuntimeError("not a directory: {}".format(icuDir))
2022
2023    reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}')
2024
2025    for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")):
2026        m = reVersion.match(line)
2027        if m:
2028            version = m.group(1)
2029            break
2030
2031    if version is None:
2032        raise RuntimeError("can't resolve CLDR version")
2033
2034    return version
2035
2036
2037def updateCLDRLangTags(args):
2038    """Update the LanguageTagGenerated.cpp file."""
2039    version = args.version
2040    url = args.url
2041    out = args.out
2042    filename = args.file
2043
2044    # Determine current CLDR version from ICU.
2045    if version is None:
2046        version = readCLDRVersionFromICU()
2047
2048    url = url.replace("<VERSION>", version)
2049
2050    print("Arguments:")
2051    print("\tCLDR version: %s" % version)
2052    print("\tDownload url: %s" % url)
2053    if filename is not None:
2054        print("\tLocal CLDR core.zip file: %s" % filename)
2055    print("\tOutput file: %s" % out)
2056    print("")
2057
2058    data = {
2059        "version": version,
2060    }
2061
2062    def readFiles(cldr_file):
2063        with ZipFile(cldr_file) as zip_file:
2064            data.update(readSupplementalData(zip_file))
2065            data.update(readUnicodeExtensions(zip_file))
2066
2067    print("Processing CLDR data...")
2068    if filename is not None:
2069        print("Always make sure you have the newest CLDR core.zip!")
2070        with open(filename, "rb") as cldr_file:
2071            readFiles(cldr_file)
2072    else:
2073        print("Downloading CLDR core.zip...")
2074        with closing(urlopen(url)) as cldr_file:
2075            cldr_data = io.BytesIO(cldr_file.read())
2076            readFiles(cldr_data)
2077
2078    print("Writing Intl data...")
2079    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
2080        println = partial(print, file=f)
2081
2082        writeCLDRLanguageTagData(println, data, url)
2083
2084    print("Writing Intl test data...")
2085    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
2086    test_file = os.path.join(
2087        js_src_builtin_intl_dir,
2088        "../../tests/non262/Intl/Locale/likely-subtags-generated.js",
2089    )
2090    with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
2091        println = partial(print, file=f)
2092
2093        println("// |reftest| skip-if(!this.hasOwnProperty('Intl'))")
2094        writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
2095
2096
2097def flines(filepath, encoding="utf-8"):
2098    """Open filepath and iterate over its content."""
2099    with io.open(filepath, mode="r", encoding=encoding) as f:
2100        for line in f:
2101            yield line
2102
2103
2104@total_ordering
2105class Zone(object):
2106    """Time zone with optional file name."""
2107
2108    def __init__(self, name, filename=""):
2109        self.name = name
2110        self.filename = filename
2111
2112    def __eq__(self, other):
2113        return hasattr(other, "name") and self.name == other.name
2114
2115    def __lt__(self, other):
2116        return self.name < other.name
2117
2118    def __hash__(self):
2119        return hash(self.name)
2120
2121    def __str__(self):
2122        return self.name
2123
2124    def __repr__(self):
2125        return self.name
2126
2127
2128class TzDataDir(object):
2129    """tzdata source from a directory."""
2130
2131    def __init__(self, obj):
2132        self.name = partial(os.path.basename, obj)
2133        self.resolve = partial(os.path.join, obj)
2134        self.basename = os.path.basename
2135        self.isfile = os.path.isfile
2136        self.listdir = partial(os.listdir, obj)
2137        self.readlines = flines
2138
2139
2140class TzDataFile(object):
2141    """tzdata source from a file (tar or gzipped)."""
2142
2143    def __init__(self, obj):
2144        self.name = lambda: os.path.splitext(
2145            os.path.splitext(os.path.basename(obj))[0]
2146        )[0]
2147        self.resolve = obj.getmember
2148        self.basename = attrgetter("name")
2149        self.isfile = tarfile.TarInfo.isfile
2150        self.listdir = obj.getnames
2151        self.readlines = partial(self._tarlines, obj)
2152
2153    def _tarlines(self, tar, m):
2154        with closing(tar.extractfile(m)) as f:
2155            for line in f:
2156                yield line.decode("utf-8")
2157
2158
2159def validateTimeZones(zones, links):
2160    """Validate the zone and link entries."""
2161    linkZones = set(links.keys())
2162    intersect = linkZones.intersection(zones)
2163    if intersect:
2164        raise RuntimeError("Links also present in zones: %s" % intersect)
2165
2166    zoneNames = {z.name for z in zones}
2167    linkTargets = set(links.values())
2168    if not linkTargets.issubset(zoneNames):
2169        raise RuntimeError(
2170            "Link targets not found: %s" % linkTargets.difference(zoneNames)
2171        )
2172
2173
2174def partition(iterable, *predicates):
2175    def innerPartition(pred, it):
2176        it1, it2 = tee(it)
2177        return (filter(pred, it1), filterfalse(pred, it2))
2178
2179    if len(predicates) == 0:
2180        return iterable
2181    (left, right) = innerPartition(predicates[0], iterable)
2182    if len(predicates) == 1:
2183        return (left, right)
2184    return tuple([left] + list(partition(right, *predicates[1:])))
2185
2186
2187def listIANAFiles(tzdataDir):
2188    def isTzFile(d, m, f):
2189        return m(f) and d.isfile(d.resolve(f))
2190
2191    return filter(
2192        partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match),
2193        tzdataDir.listdir(),
2194    )
2195
2196
2197def readIANAFiles(tzdataDir, files):
2198    """Read all IANA time zone files from the given iterable."""
2199    nameSyntax = "[\w/+\-]+"
2200    pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax)
2201    pLink = re.compile(
2202        r"Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?" % (nameSyntax, nameSyntax)
2203    )
2204
2205    def createZone(line, fname):
2206        match = pZone.match(line)
2207        name = match.group("name")
2208        return Zone(name, fname)
2209
2210    def createLink(line, fname):
2211        match = pLink.match(line)
2212        (name, target) = match.group("name", "target")
2213        return (Zone(name, fname), target)
2214
2215    zones = set()
2216    links = dict()
2217    for filename in files:
2218        filepath = tzdataDir.resolve(filename)
2219        for line in tzdataDir.readlines(filepath):
2220            if line.startswith("Zone"):
2221                zones.add(createZone(line, filename))
2222            if line.startswith("Link"):
2223                (link, target) = createLink(line, filename)
2224                links[link] = target
2225
2226    return (zones, links)
2227
2228
2229def readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory):
2230    """Read the IANA time zone information from `tzdataDir`."""
2231
2232    backzoneFiles = {"backzone"}
2233    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
2234
2235    # Read zone and link infos.
2236    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2237    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
2238
2239    # Remove the placeholder time zone "Factory".
2240    if ignoreFactory:
2241        zones.remove(Zone("Factory"))
2242
2243    # Merge with backzone data.
2244    if not ignoreBackzone:
2245        zones |= backzones
2246        links = {
2247            name: target for name, target in links.items() if name not in backzones
2248        }
2249        links.update(backlinks)
2250
2251    validateTimeZones(zones, links)
2252
2253    return (zones, links)
2254
2255
2256def readICUResourceFile(filename):
2257    """Read an ICU resource file.
2258
2259    Yields (<table-name>, <startOrEnd>, <value>) for each table.
2260    """
2261
2262    numberValue = r"-?\d+"
2263    stringValue = r'".+?"'
2264
2265    def asVector(val):
2266        return r"%s(?:\s*,\s*%s)*" % (val, val)
2267
2268    numberVector = asVector(numberValue)
2269    stringVector = asVector(stringValue)
2270
2271    reNumberVector = re.compile(numberVector)
2272    reStringVector = re.compile(stringVector)
2273    reNumberValue = re.compile(numberValue)
2274    reStringValue = re.compile(stringValue)
2275
2276    def parseValue(value):
2277        m = reNumberVector.match(value)
2278        if m:
2279            return [int(v) for v in reNumberValue.findall(value)]
2280        m = reStringVector.match(value)
2281        if m:
2282            return [v[1:-1] for v in reStringValue.findall(value)]
2283        raise RuntimeError("unknown value type: %s" % value)
2284
2285    def extractValue(values):
2286        if len(values) == 0:
2287            return None
2288        if len(values) == 1:
2289            return values[0]
2290        return values
2291
2292    def line(*args):
2293        maybeMultiComments = r"(?:/\*[^*]*\*/)*"
2294        maybeSingleComment = r"(?://.*)?"
2295        lineStart = "^%s" % maybeMultiComments
2296        lineEnd = "%s\s*%s$" % (maybeMultiComments, maybeSingleComment)
2297        return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd])))
2298
2299    tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)'
2300    tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector)
2301
2302    reStartTable = line(tableName, r"\{")
2303    reEndTable = line(r"\}")
2304    reSingleValue = line(r",?", tableValue, r",?")
2305    reCompactTable = line(tableName, r"\{", tableValue, r"\}")
2306    reEmptyLine = line()
2307
2308    tables = []
2309
2310    def currentTable():
2311        return "|".join(tables)
2312
2313    values = []
2314    for line in flines(filename, "utf-8-sig"):
2315        line = line.strip()
2316        if line == "":
2317            continue
2318
2319        m = reEmptyLine.match(line)
2320        if m:
2321            continue
2322
2323        m = reStartTable.match(line)
2324        if m:
2325            assert len(values) == 0
2326            tables.append(m.group("name"))
2327            continue
2328
2329        m = reEndTable.match(line)
2330        if m:
2331            yield (currentTable(), extractValue(values))
2332            tables.pop()
2333            values = []
2334            continue
2335
2336        m = reCompactTable.match(line)
2337        if m:
2338            assert len(values) == 0
2339            tables.append(m.group("name"))
2340            yield (currentTable(), extractValue(parseValue(m.group("value"))))
2341            tables.pop()
2342            continue
2343
2344        m = reSingleValue.match(line)
2345        if m and tables:
2346            values.extend(parseValue(m.group("value")))
2347            continue
2348
2349        raise RuntimeError("unknown entry: %s" % line)
2350
2351
2352def readICUTimeZonesFromTimezoneTypes(icuTzDir):
2353    """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt
2354    and returns the tuple (zones, links).
2355    """
2356    typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|"
2357    typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|"
2358
2359    def toTimeZone(name):
2360        return Zone(name.replace(":", "/"))
2361
2362    zones = set()
2363    links = dict()
2364
2365    for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")):
2366        if name.startswith(typeMapTimeZoneKey):
2367            zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :]))
2368        if name.startswith(typeAliasTimeZoneKey):
2369            links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value
2370
2371    validateTimeZones(zones, links)
2372
2373    return (zones, links)
2374
2375
2376def readICUTimeZonesFromZoneInfo(icuTzDir):
2377    """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt
2378    and returns the tuple (zones, links).
2379    """
2380    zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
2381    linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
2382    namesKey = "zoneinfo64:table(nofallback)|Names"
2383
2384    tzId = 0
2385    tzLinks = dict()
2386    tzNames = []
2387
2388    for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
2389        if name == zoneKey:
2390            tzId += 1
2391        elif name == linkKey:
2392            tzLinks[tzId] = int(value)
2393            tzId += 1
2394        elif name == namesKey:
2395            tzNames.extend(value)
2396
2397    links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
2398    zones = {Zone(v) for v in tzNames if Zone(v) not in links}
2399
2400    validateTimeZones(zones, links)
2401
2402    return (zones, links)
2403
2404
2405def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
2406    # zoneinfo64.txt contains the supported time zones by ICU. This data is
2407    # generated from tzdata files, it doesn't include "backzone" in stock ICU.
2408    (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir)
2409
2410    # timezoneTypes.txt contains the canonicalization information for ICU. This
2411    # data is generated from CLDR files. It includes data about time zones from
2412    # tzdata's "backzone" file.
2413    (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)
2414
2415    # Remove the placeholder time zone "Factory".
2416    # See also <https://github.com/eggert/tz/blob/master/factory>.
2417    if ignoreFactory:
2418        zoneinfoZones.remove(Zone("Factory"))
2419
2420    # Remove the ICU placeholder time zone "Etc/Unknown".
2421    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
2422    for zones in (zoneinfoZones, typesZones):
2423        zones.remove(Zone("Etc/Unknown"))
2424
2425    # Remove any outdated ICU links.
2426    for links in (zoneinfoLinks, typesLinks):
2427        for zone in otherICULegacyLinks().keys():
2428            if zone not in links:
2429                raise KeyError(f"Can't remove non-existent link from '{zone}'")
2430            del links[zone]
2431
2432    # Information in zoneinfo64 should be a superset of timezoneTypes.
2433    def inZoneInfo64(zone):
2434        return zone in zoneinfoZones or zone in zoneinfoLinks
2435
2436    notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
2437    if notFoundInZoneInfo64:
2438        raise RuntimeError(
2439            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
2440        )
2441
2442    notFoundInZoneInfo64 = [
2443        zone for zone in typesLinks.keys() if not inZoneInfo64(zone)
2444    ]
2445    if notFoundInZoneInfo64:
2446        raise RuntimeError(
2447            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
2448        )
2449
2450    # zoneinfo64.txt only defines the supported time zones by ICU, the canonicalization
2451    # rules are defined through timezoneTypes.txt. Merge both to get the actual zones
2452    # and links used by ICU.
2453    icuZones = set(
2454        chain(
2455            (zone for zone in zoneinfoZones if zone not in typesLinks),
2456            (zone for zone in typesZones),
2457        )
2458    )
2459    icuLinks = dict(
2460        chain(
2461            (
2462                (zone, target)
2463                for (zone, target) in zoneinfoLinks.items()
2464                if zone not in typesZones
2465            ),
2466            ((zone, target) for (zone, target) in typesLinks.items()),
2467        )
2468    )
2469
2470    return (icuZones, icuLinks)
2471
2472
2473def readICULegacyZones(icuDir):
2474    """Read the ICU legacy time zones from `icuTzDir`/tools/tzcode/icuzones
2475    and returns the tuple (zones, links).
2476    """
2477    tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))
2478
2479    # Per spec we must recognize only IANA time zones and links, but ICU
2480    # recognizes various legacy, non-IANA time zones and links. Compute these
2481    # non-IANA time zones and links.
2482
2483    # Most legacy, non-IANA time zones and links are in the icuzones file.
2484    (zones, links) = readIANAFiles(tzdir, ["icuzones"])
2485
2486    # Remove the ICU placeholder time zone "Etc/Unknown".
2487    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
2488    zones.remove(Zone("Etc/Unknown"))
2489
2490    # A handful of non-IANA zones/links are not in icuzones and must be added
2491    # manually so that we won't invoke ICU with them.
2492    for (zone, target) in otherICULegacyLinks().items():
2493        if zone in links:
2494            if links[zone] != target:
2495                raise KeyError(
2496                    f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'"
2497                )
2498            else:
2499                print(
2500                    f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()"
2501                )
2502        links[zone] = target
2503
2504    return (zones, links)
2505
2506
2507def otherICULegacyLinks():
2508    """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time
2509    zones with the exception of time zones which are removed by IANA after an
2510    ICU release.
2511
2512    For example ICU 67 uses tzdata2018i, but tzdata2020b removed the link from
2513    "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates
2514    don't include modified icuzones files, so we must manually record any IANA
2515    modifications here.
2516
2517    After an ICU update, we can remove any no longer needed entries from this
2518    function by checking if the relevant entries are now included in icuzones.
2519    """
2520
2521    return {
2522        # Current ICU is up-to-date with IANA, so this dict is empty.
2523    }
2524
2525
2526def icuTzDataVersion(icuTzDir):
2527    """Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt."""
2528
2529    def searchInFile(pattern, f):
2530        p = re.compile(pattern)
2531        for line in flines(f, "utf-8-sig"):
2532            m = p.search(line)
2533            if m:
2534                return m.group(1)
2535        return None
2536
2537    zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt")
2538    if not os.path.isfile(zoneinfo):
2539        raise RuntimeError("file not found: %s" % zoneinfo)
2540    version = searchInFile("^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo)
2541    if version is None:
2542        raise RuntimeError(
2543            "%s does not contain a valid tzdata version string" % zoneinfo
2544        )
2545    return version
2546
2547
2548def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone):
2549    """Find incorrect ICU zone entries."""
2550
2551    def isIANATimeZone(zone):
2552        return zone in ianaZones or zone in ianaLinks
2553
2554    def isICUTimeZone(zone):
2555        return zone in icuZones or zone in icuLinks
2556
2557    def isICULink(zone):
2558        return zone in icuLinks
2559
2560    # All IANA zones should be present in ICU.
2561    missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)]
2562    # Normally zones in backzone are also present as links in one of the other
2563    # time zone files. The only exception to this rule is the Asia/Hanoi time
2564    # zone, this zone is only present in the backzone file.
2565    expectedMissing = [] if ignoreBackzone else [Zone("Asia/Hanoi")]
2566    if missingTimeZones != expectedMissing:
2567        raise RuntimeError(
2568            "Not all zones are present in ICU, did you forget "
2569            "to run intl/update-tzdata.sh? %s" % missingTimeZones
2570        )
2571
2572    # Zones which are only present in ICU?
2573    additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)]
2574    if additionalTimeZones:
2575        raise RuntimeError(
2576            "Additional zones present in ICU, did you forget "
2577            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
2578        )
2579
2580    # Zones which are marked as links in ICU.
2581    result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))
2582
2583    # Remove unnecessary UTC mappings.
2584    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
2585    result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)
2586
2587    return sorted(result, key=itemgetter(0))
2588
2589
2590def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
2591    """Find incorrect ICU link entries."""
2592
2593    def isIANATimeZone(zone):
2594        return zone in ianaZones or zone in ianaLinks
2595
2596    def isICUTimeZone(zone):
2597        return zone in icuZones or zone in icuLinks
2598
2599    def isICULink(zone):
2600        return zone in icuLinks
2601
2602    def isICUZone(zone):
2603        return zone in icuZones
2604
2605    # All links should be present in ICU.
2606    missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
2607    if missingTimeZones:
2608        raise RuntimeError(
2609            "Not all zones are present in ICU, did you forget "
2610            "to run intl/update-tzdata.sh? %s" % missingTimeZones
2611        )
2612
2613    # Links which are only present in ICU?
2614    additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
2615    if additionalTimeZones:
2616        raise RuntimeError(
2617            "Additional links present in ICU, did you forget "
2618            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
2619        )
2620
2621    result = chain(
2622        # IANA links which have a different target in ICU.
2623        (
2624            (zone, target, icuLinks[zone])
2625            for (zone, target) in ianaLinks.items()
2626            if isICULink(zone) and target != icuLinks[zone]
2627        ),
2628        # IANA links which are zones in ICU.
2629        (
2630            (zone, target, zone.name)
2631            for (zone, target) in ianaLinks.items()
2632            if isICUZone(zone)
2633        ),
2634    )
2635
2636    # Remove unnecessary UTC mappings.
2637    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
2638    result = (
2639        (zone, target, icuTarget)
2640        for (zone, target, icuTarget) in result
2641        if target not in utcnames or icuTarget not in utcnames
2642    )
2643
2644    return sorted(result, key=itemgetter(0))
2645
2646
2647generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT."
2648tzdataVersionComment = "// tzdata version = {0}"
2649
2650
2651def processTimeZones(
2652    tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignoreFactory, out
2653):
2654    """Read the time zone info and create a new time zone cpp file."""
2655    print("Processing tzdata mapping...")
2656    (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
2657    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
2658    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)
2659
2660    # Remove all legacy ICU time zones.
2661    icuZones = {zone for zone in icuZones if zone not in legacyZones}
2662    icuLinks = {
2663        zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
2664    }
2665
2666    incorrectZones = findIncorrectICUZones(
2667        ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone
2668    )
2669    if not incorrectZones:
2670        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
2671        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")
2672
2673    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
2674    if not incorrectLinks:
2675        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
2676        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")
2677
2678    print("Writing Intl tzdata file...")
2679    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
2680        println = partial(print, file=f)
2681
2682        println(generatedFileWarning)
2683        println(tzdataVersionComment.format(version))
2684        println("")
2685
2686        println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
2687        println("#define builtin_intl_TimeZoneDataGenerated_h")
2688        println("")
2689
2690        println("namespace js {")
2691        println("namespace timezone {")
2692        println("")
2693
2694        println("// Format:")
2695        println('// "ZoneName" // ICU-Name [time zone file]')
2696        println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
2697        for (zone, icuZone) in incorrectZones:
2698            println('    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
2699        println("};")
2700        println("")
2701
2702        println("// Format:")
2703        println('// "LinkName", "Target" // ICU-Target [time zone file]')
2704        println("struct LinkAndTarget")
2705        println("{")
2706        println("    const char* const link;")
2707        println("    const char* const target;")
2708        println("};")
2709        println("")
2710        println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
2711        for (zone, target, icuTarget) in incorrectLinks:
2712            println(
2713                '    { "%s", "%s" }, // %s [%s]'
2714                % (zone, target, icuTarget, zone.filename)
2715            )
2716        println("};")
2717        println("")
2718
2719        println(
2720            "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
2721        )
2722        println("// disallow the old and deprecated System V time zones.")
2723        println(
2724            "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
2725        )  # NOQA: E501
2726        println("const char* const legacyICUTimeZones[] = {")
2727        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
2728            println('    "%s",' % zone)
2729        println("};")
2730        println("")
2731
2732        println("} // namespace timezone")
2733        println("} // namespace js")
2734        println("")
2735        println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")
2736
2737
2738def updateBackzoneLinks(tzdataDir, links):
2739    def withZone(fn):
2740        return lambda zone_target: fn(zone_target[0])
2741
2742    (backzoneZones, backzoneLinks) = readIANAFiles(tzdataDir, ["backzone"])
2743    (stableZones, updatedLinks, updatedZones) = partition(
2744        links.items(),
2745        # Link not changed in backzone.
2746        withZone(lambda zone: zone not in backzoneLinks and zone not in backzoneZones),
2747        # Link has a new target.
2748        withZone(lambda zone: zone in backzoneLinks),
2749    )
2750    # Keep stable zones and links with updated target.
2751    return dict(
2752        chain(
2753            stableZones,
2754            map(withZone(lambda zone: (zone, backzoneLinks[zone])), updatedLinks),
2755        )
2756    )
2757
2758
2759def generateTzDataLinkTestContent(testDir, version, fileName, description, links):
2760    with io.open(
2761        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
2762    ) as f:
2763        println = partial(print, file=f)
2764
2765        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
2766        println("")
2767        println(generatedFileWarning)
2768        println(tzdataVersionComment.format(version))
2769        println(
2770            """
2771const tzMapper = [
2772    x => x,
2773    x => x.toUpperCase(),
2774    x => x.toLowerCase(),
2775];
2776"""
2777        )
2778
2779        println(description)
2780        println("const links = {")
2781        for (zone, target) in sorted(links, key=itemgetter(0)):
2782            println('    "%s": "%s",' % (zone, target))
2783        println("};")
2784
2785        println(
2786            """
2787for (let [linkName, target] of Object.entries(links)) {
2788    if (target === "Etc/UTC" || target === "Etc/GMT")
2789        target = "UTC";
2790
2791    for (let map of tzMapper) {
2792        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
2793        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
2794        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
2795    }
2796}
2797"""
2798        )
2799        println(
2800            """
2801if (typeof reportCompare === "function")
2802    reportCompare(0, 0, "ok");
2803"""
2804        )
2805
2806
2807def generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
2808    (zones, links) = readIANAFiles(tzdataDir, ["backward"])
2809    assert len(zones) == 0
2810
2811    if not ignoreBackzone:
2812        links = updateBackzoneLinks(tzdataDir, links)
2813
2814    generateTzDataLinkTestContent(
2815        testDir,
2816        version,
2817        "timeZone_backward_links.js",
2818        "// Link names derived from IANA Time Zone Database, backward file.",
2819        links.items(),
2820    )
2821
2822
2823def generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
2824    tzfiles = filterfalse(
2825        {"backward", "backzone"}.__contains__, listIANAFiles(tzdataDir)
2826    )
2827    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2828
2829    if not ignoreBackzone:
2830        links = updateBackzoneLinks(tzdataDir, links)
2831
2832    generateTzDataLinkTestContent(
2833        testDir,
2834        version,
2835        "timeZone_notbackward_links.js",
2836        "// Link names derived from IANA Time Zone Database, excluding backward file.",
2837        links.items(),
2838    )
2839
2840
2841def generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir):
2842    backzoneFiles = {"backzone"}
2843    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
2844
2845    # Read zone and link infos.
2846    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2847    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
2848
2849    if not ignoreBackzone:
2850        comment = """\
2851// This file was generated with historical, pre-1970 backzone information
2852// respected. Therefore, every zone key listed below is its own Zone, not
2853// a Link to a modern-day target as IANA ignoring backzones would say.
2854
2855"""
2856    else:
2857        comment = """\
2858// This file was generated while ignoring historical, pre-1970 backzone
2859// information. Therefore, every zone key listed below is part of a Link
2860// whose target is the corresponding value.
2861
2862"""
2863
2864    generateTzDataLinkTestContent(
2865        testDir,
2866        version,
2867        "timeZone_backzone.js",
2868        comment + "// Backzone zones derived from IANA Time Zone Database.",
2869        (
2870            (zone, zone if not ignoreBackzone else links[zone])
2871            for zone in backzones
2872            if zone in links
2873        ),
2874    )
2875
2876
2877def generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir):
2878    backzoneFiles = {"backzone"}
2879    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
2880
2881    # Read zone and link infos.
2882    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
2883    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
2884
2885    if not ignoreBackzone:
2886        comment = """\
2887// This file was generated with historical, pre-1970 backzone information
2888// respected. Therefore, every zone key listed below points to a target
2889// in the backzone file and not to its modern-day target as IANA ignoring
2890// backzones would say.
2891
2892"""
2893    else:
2894        comment = """\
2895// This file was generated while ignoring historical, pre-1970 backzone
2896// information. Therefore, every zone key listed below is part of a Link
2897// whose target is the corresponding value ignoring any backzone entries.
2898
2899"""
2900
2901    generateTzDataLinkTestContent(
2902        testDir,
2903        version,
2904        "timeZone_backzone_links.js",
2905        comment + "// Backzone links derived from IANA Time Zone Database.",
2906        (
2907            (zone, target if not ignoreBackzone else links[zone])
2908            for (zone, target) in backlinks.items()
2909        ),
2910    )
2911
2912
2913def generateTzDataTestVersion(tzdataDir, version, testDir):
2914    fileName = "timeZone_version.js"
2915
2916    with io.open(
2917        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
2918    ) as f:
2919        println = partial(print, file=f)
2920
2921        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
2922        println("")
2923        println(generatedFileWarning)
2924        println(tzdataVersionComment.format(version))
2925        println("""const tzdata = "{0}";""".format(version))
2926
2927        println(
2928            """
2929if (typeof getICUOptions === "undefined") {
2930    var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
2931}
2932
2933var options = getICUOptions();
2934
2935assertEq(options.tzdata, tzdata);
2936
2937if (typeof reportCompare === "function")
2938    reportCompare(0, 0, "ok");
2939"""
2940        )
2941
2942
2943def generateTzDataTestCanonicalZones(
2944    tzdataDir, version, ignoreBackzone, ignoreFactory, testDir
2945):
2946    fileName = "supportedValuesOf-timeZones-canonical.js"
2947
2948    # Read zone and link infos.
2949    (ianaZones, _) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
2950
2951    # Replace Etc/GMT and Etc/UTC with UTC.
2952    ianaZones.remove(Zone("Etc/GMT"))
2953    ianaZones.remove(Zone("Etc/UTC"))
2954    ianaZones.add(Zone("UTC"))
2955
2956    # See findIncorrectICUZones() for why Asia/Hanoi has to be special-cased.
2957    ianaZones.remove(Zone("Asia/Hanoi"))
2958
2959    if not ignoreBackzone:
2960        comment = """\
2961// This file was generated with historical, pre-1970 backzone information
2962// respected.
2963"""
2964    else:
2965        comment = """\
2966// This file was generated while ignoring historical, pre-1970 backzone
2967// information.
2968"""
2969
2970    with io.open(
2971        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
2972    ) as f:
2973        println = partial(print, file=f)
2974
2975        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
2976        println("")
2977        println(generatedFileWarning)
2978        println(tzdataVersionComment.format(version))
2979        println("")
2980        println(comment)
2981
2982        println("const zones = [")
2983        for zone in sorted(ianaZones):
2984            println(f'  "{zone}",')
2985        println("];")
2986
2987        println(
2988            """
2989let supported = Intl.supportedValuesOf("timeZone");
2990
2991assertEqArray(supported, zones);
2992
2993if (typeof reportCompare === "function")
2994    reportCompare(0, 0, "ok");
2995"""
2996        )
2997
2998
2999def generateTzDataTests(tzdataDir, version, ignoreBackzone, ignoreFactory, testDir):
3000    dtfTestDir = os.path.join(testDir, "DateTimeFormat")
3001    if not os.path.isdir(dtfTestDir):
3002        raise RuntimeError("not a directory: %s" % dtfTestDir)
3003
3004    generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, dtfTestDir)
3005    generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, dtfTestDir)
3006    generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, dtfTestDir)
3007    generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, dtfTestDir)
3008    generateTzDataTestVersion(tzdataDir, version, dtfTestDir)
3009    generateTzDataTestCanonicalZones(
3010        tzdataDir, version, ignoreBackzone, ignoreFactory, testDir
3011    )
3012
3013
3014def updateTzdata(topsrcdir, args):
3015    """Update the time zone cpp file."""
3016
3017    icuDir = os.path.join(topsrcdir, "intl/icu/source")
3018    if not os.path.isdir(icuDir):
3019        raise RuntimeError("not a directory: %s" % icuDir)
3020
3021    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
3022    if not os.path.isdir(icuTzDir):
3023        raise RuntimeError("not a directory: %s" % icuTzDir)
3024
3025    intlTestDir = os.path.join(topsrcdir, "js/src/tests/non262/Intl")
3026    if not os.path.isdir(intlTestDir):
3027        raise RuntimeError("not a directory: %s" % intlTestDir)
3028
3029    tzDir = args.tz
3030    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
3031        raise RuntimeError("not a directory or file: %s" % tzDir)
3032    ignoreBackzone = args.ignore_backzone
3033    # TODO: Accept or ignore the placeholder time zone "Factory"?
3034    ignoreFactory = False
3035    out = args.out
3036
3037    version = icuTzDataVersion(icuTzDir)
3038    url = (
3039        "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
3040    )
3041
3042    print("Arguments:")
3043    print("\ttzdata version: %s" % version)
3044    print("\ttzdata URL: %s" % url)
3045    print("\ttzdata directory|file: %s" % tzDir)
3046    print("\tICU directory: %s" % icuDir)
3047    print("\tICU timezone directory: %s" % icuTzDir)
3048    print("\tIgnore backzone file: %s" % ignoreBackzone)
3049    print("\tOutput file: %s" % out)
3050    print("")
3051
3052    def updateFrom(f):
3053        if os.path.isfile(f) and tarfile.is_tarfile(f):
3054            with tarfile.open(f, "r:*") as tar:
3055                processTimeZones(
3056                    TzDataFile(tar),
3057                    icuDir,
3058                    icuTzDir,
3059                    version,
3060                    ignoreBackzone,
3061                    ignoreFactory,
3062                    out,
3063                )
3064                generateTzDataTests(
3065                    TzDataFile(tar), version, ignoreBackzone, ignoreFactory, intlTestDir
3066                )
3067        elif os.path.isdir(f):
3068            processTimeZones(
3069                TzDataDir(f),
3070                icuDir,
3071                icuTzDir,
3072                version,
3073                ignoreBackzone,
3074                ignoreFactory,
3075                out,
3076            )
3077            generateTzDataTests(
3078                TzDataDir(f), version, ignoreBackzone, ignoreFactory, intlTestDir
3079            )
3080        else:
3081            raise RuntimeError("unknown format")
3082
3083    if tzDir is None:
3084        print("Downloading tzdata file...")
3085        with closing(urlopen(url)) as tzfile:
3086            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
3087            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
3088                print("File stored in %s" % tztmpfile.name)
3089                tztmpfile.write(tzfile.read())
3090                tztmpfile.flush()
3091                updateFrom(tztmpfile.name)
3092    else:
3093        updateFrom(tzDir)
3094
3095
3096def readCurrencyFile(tree):
3097    reCurrency = re.compile(r"^[A-Z]{3}$")
3098    reIntMinorUnits = re.compile(r"^\d+$")
3099
3100    for country in tree.iterfind(".//CcyNtry"):
3101        # Skip entry if no currency information is available.
3102        currency = country.findtext("Ccy")
3103        if currency is None:
3104            continue
3105        assert reCurrency.match(currency)
3106
3107        minorUnits = country.findtext("CcyMnrUnts")
3108        assert minorUnits is not None
3109
3110        # Skip all entries without minorUnits or which use the default minorUnits.
3111        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
3112            currencyName = country.findtext("CcyNm")
3113            countryName = country.findtext("CtryNm")
3114            yield (currency, int(minorUnits), currencyName, countryName)
3115
3116
3117def writeCurrencyFile(published, currencies, out):
3118    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
3119        println = partial(print, file=f)
3120
3121        println(generatedFileWarning)
3122        println("// Version: {}".format(published))
3123
3124        println(
3125            """
3126/**
3127 * Mapping from currency codes to the number of decimal digits used for them.
3128 * Default is 2 digits.
3129 *
3130 * Spec: ISO 4217 Currency and Funds Code List.
3131 * http://www.currency-iso.org/en/home/tables/table-a1.html
3132 */"""
3133        )
3134        println("var currencyDigits = {")
3135        for (currency, entries) in groupby(
3136            sorted(currencies, key=itemgetter(0)), itemgetter(0)
3137        ):
3138            for (_, minorUnits, currencyName, countryName) in entries:
3139                println("    // {} ({})".format(currencyName, countryName))
3140            println("    {}: {},".format(currency, minorUnits))
3141        println("};")
3142
3143
3144def updateCurrency(topsrcdir, args):
3145    """Update the CurrencyDataGenerated.js file."""
3146    import xml.etree.ElementTree as ET
3147    from random import randint
3148
3149    url = args.url
3150    out = args.out
3151    filename = args.file
3152
3153    print("Arguments:")
3154    print("\tDownload url: %s" % url)
3155    print("\tLocal currency file: %s" % filename)
3156    print("\tOutput file: %s" % out)
3157    print("")
3158
3159    def updateFrom(currencyFile):
3160        print("Processing currency code list file...")
3161        tree = ET.parse(currencyFile)
3162        published = tree.getroot().attrib["Pblshd"]
3163        currencies = readCurrencyFile(tree)
3164
3165        print("Writing CurrencyData file...")
3166        writeCurrencyFile(published, currencies, out)
3167
3168    if filename is not None:
3169        print("Always make sure you have the newest currency code list file!")
3170        updateFrom(filename)
3171    else:
3172        print("Downloading currency & funds code list...")
3173        request = UrlRequest(url)
3174        request.add_header(
3175            "User-agent",
3176            "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
3177                randint(1, 999)
3178            ),
3179        )
3180        with closing(urlopen(request)) as currencyFile:
3181            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
3182            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
3183                print("File stored in %s" % currencyTmpFile.name)
3184                currencyTmpFile.write(currencyFile.read())
3185                currencyTmpFile.flush()
3186                updateFrom(currencyTmpFile.name)
3187
3188
3189def writeUnicodeExtensionsMappings(println, mapping, extension):
3190    println(
3191        """
3192template <size_t Length>
3193static inline bool Is{0}Key(mozilla::Span<const char> key, const char (&str)[Length]) {{
3194  static_assert(Length == {0}KeyLength + 1,
3195                "{0} extension key is two characters long");
3196  return memcmp(key.data(), str, Length - 1) == 0;
3197}}
3198
3199template <size_t Length>
3200static inline bool Is{0}Type(mozilla::Span<const char> type, const char (&str)[Length]) {{
3201  static_assert(Length > {0}KeyLength + 1,
3202                "{0} extension type contains more than two characters");
3203  return type.size() == (Length - 1) &&
3204         memcmp(type.data(), str, Length - 1) == 0;
3205}}
3206""".format(
3207            extension
3208        ).rstrip(
3209            "\n"
3210        )
3211    )
3212
3213    linear_search_max_length = 4
3214
3215    needs_binary_search = any(
3216        len(replacements.items()) > linear_search_max_length
3217        for replacements in mapping.values()
3218    )
3219
3220    if needs_binary_search:
3221        println(
3222            """
3223static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
3224  MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
3225             "unexpected null-character in string");
3226
3227  using UnsignedChar = unsigned char;
3228  for (size_t i = 0; i < b.size(); i++) {{
3229    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
3230    // we've reached the end of |a|, the below if-statement will always be true.
3231    // That ensures we don't read past the end of |a|.
3232    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
3233      return r;
3234    }}
3235  }}
3236
3237  // Return zero if both strings are equal or a positive number if |b| is a
3238  // prefix of |a|.
3239  return int32_t(UnsignedChar(a[b.size()]));
3240}}
3241
3242template <size_t Length>
3243static inline const char* Search{0}Replacement(
3244  const char* (&types)[Length], const char* (&aliases)[Length],
3245  mozilla::Span<const char> type) {{
3246
3247  auto p = std::lower_bound(std::begin(types), std::end(types), type,
3248                            [](const auto& a, const auto& b) {{
3249                              return Compare{0}Type(a, b) < 0;
3250                            }});
3251  if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
3252    return aliases[std::distance(std::begin(types), p)];
3253  }}
3254  return nullptr;
3255}}
3256""".format(
3257                extension
3258            ).rstrip(
3259                "\n"
3260            )
3261        )
3262
3263    println(
3264        """
3265/**
3266 * Mapping from deprecated BCP 47 {0} extension types to their preferred
3267 * values.
3268 *
3269 * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
3270 * Spec: https://www.unicode.org/reports/tr35/#t_Extension
3271 */
3272const char* mozilla::intl::Locale::Replace{0}ExtensionType(
3273    mozilla::Span<const char> key, mozilla::Span<const char> type) {{
3274  MOZ_ASSERT(key.size() == {0}KeyLength);
3275  MOZ_ASSERT(IsCanonicallyCased{0}Key(key));
3276
3277  MOZ_ASSERT(type.size() > {0}KeyLength);
3278  MOZ_ASSERT(IsCanonicallyCased{0}Type(type));
3279""".format(
3280            extension
3281        )
3282    )
3283
3284    def to_hash_key(replacements):
3285        return str(sorted(replacements.items()))
3286
3287    def write_array(subtags, name, length):
3288        max_entries = (80 - len("    ")) // (length + len('"", '))
3289
3290        println("    static const char* {}[{}] = {{".format(name, len(subtags)))
3291
3292        for entries in grouper(subtags, max_entries):
3293            entries = (
3294                '"{}"'.format(tag).center(length + 2)
3295                for tag in entries
3296                if tag is not None
3297            )
3298            println("        {},".format(", ".join(entries)))
3299
3300        println("    };")
3301
3302    # Merge duplicate keys.
3303    key_aliases = {}
3304    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
3305        hash_key = to_hash_key(replacements)
3306        if hash_key not in key_aliases:
3307            key_aliases[hash_key] = []
3308        else:
3309            key_aliases[hash_key].append(key)
3310
3311    first_key = True
3312    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
3313        hash_key = to_hash_key(replacements)
3314        if key in key_aliases[hash_key]:
3315            continue
3316
3317        cond = (
3318            'Is{}Key(key, "{}")'.format(extension, k)
3319            for k in [key] + key_aliases[hash_key]
3320        )
3321
3322        if_kind = "if" if first_key else "else if"
3323        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
3324        println(
3325            """
3326  {} ({}) {{""".format(
3327                if_kind, cond
3328            ).strip(
3329                "\n"
3330            )
3331        )
3332        first_key = False
3333
3334        replacements = sorted(replacements.items(), key=itemgetter(0))
3335
3336        if len(replacements) > linear_search_max_length:
3337            types = [t for (t, _) in replacements]
3338            preferred = [r for (_, r) in replacements]
3339            max_len = max(len(k) for k in types + preferred)
3340
3341            write_array(types, "types", max_len)
3342            write_array(preferred, "aliases", max_len)
3343            println(
3344                """
3345    return Search{}Replacement(types, aliases, type);
3346""".format(
3347                    extension
3348                ).strip(
3349                    "\n"
3350                )
3351            )
3352        else:
3353            for (type, replacement) in replacements:
3354                println(
3355                    """
3356    if (Is{}Type(type, "{}")) {{
3357      return "{}";
3358    }}""".format(
3359                        extension, type, replacement
3360                    ).strip(
3361                        "\n"
3362                    )
3363                )
3364
3365        println(
3366            """
3367  }""".lstrip(
3368                "\n"
3369            )
3370        )
3371
3372    println(
3373        """
3374  return nullptr;
3375}
3376""".strip(
3377            "\n"
3378        )
3379    )
3380
3381
3382def readICUUnitResourceFile(filepath):
3383    """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
3384    second entry the unit name.
3385
3386    Example:
3387
3388    root{
3389        units{
3390            compound{
3391            }
3392            coordinate{
3393            }
3394            length{
3395                meter{
3396                }
3397            }
3398        }
3399        unitsNarrow:alias{"/LOCALE/unitsShort"}
3400        unitsShort{
3401            duration{
3402                day{
3403                }
3404                day-person:alias{"/LOCALE/unitsShort/duration/day"}
3405            }
3406            length{
3407                meter{
3408                }
3409            }
3410        }
3411    }
3412
3413    Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}
3414    """
3415
3416    start_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
3417    end_table_re = re.compile(r"^\}$")
3418    table_entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")
3419
3420    # The current resource table.
3421    table = {}
3422
3423    # List of parent tables when parsing.
3424    parents = []
3425
3426    # Track multi-line comments state.
3427    in_multiline_comment = False
3428
3429    for line in flines(filepath, "utf-8-sig"):
3430        # Remove leading and trailing whitespace.
3431        line = line.strip()
3432
3433        # Skip over comments.
3434        if in_multiline_comment:
3435            if line.endswith("*/"):
3436                in_multiline_comment = False
3437            continue
3438
3439        if line.startswith("//"):
3440            continue
3441
3442        if line.startswith("/*"):
3443            in_multiline_comment = True
3444            continue
3445
3446        # Try to match the start of a table, e.g. `length{` or `meter{`.
3447        match = start_table_re.match(line)
3448        if match:
3449            parents.append(table)
3450            table_name = match.group(1)
3451            new_table = {}
3452            table[table_name] = new_table
3453            table = new_table
3454            continue
3455
3456        # Try to match the end of a table.
3457        match = end_table_re.match(line)
3458        if match:
3459            table = parents.pop()
3460            continue
3461
3462        # Try to match a table entry, e.g. `dnam{"meter"}`.
3463        match = table_entry_re.match(line)
3464        if match:
3465            entry_key = match.group(1)
3466            entry_value = match.group(2)
3467            table[entry_key] = entry_value
3468            continue
3469
3470        raise Exception("unexpected line: '{}' in {}".format(line, filepath))
3471
3472    assert len(parents) == 0, "Not all tables closed"
3473    assert len(table) == 1, "More than one root table"
3474
3475    # Remove the top-level language identifier table.
3476    (_, unit_table) = table.popitem()
3477
3478    # Add all units for the three display formats "units", "unitsNarrow", and "unitsShort".
3479    # But exclude the pseudo-units "compound" and "ccoordinate".
3480    return {
3481        (unit_type, unit_name if not unit_name.endswith(":alias") else unit_name[:-6])
3482        for unit_display in ("units", "unitsNarrow", "unitsShort")
3483        if unit_display in unit_table
3484        for (unit_type, unit_names) in unit_table[unit_display].items()
3485        if unit_type != "compound" and unit_type != "coordinate"
3486        for unit_name in unit_names.keys()
3487    }
3488
3489
3490def computeSupportedUnits(all_units, sanctioned_units):
3491    """Given the set of all possible ICU unit identifiers and the set of sanctioned unit
3492    identifiers, compute the set of effectively supported ICU unit identifiers.
3493    """
3494
3495    def find_match(unit):
3496        unit_match = [
3497            (unit_type, unit_name)
3498            for (unit_type, unit_name) in all_units
3499            if unit_name == unit
3500        ]
3501        if unit_match:
3502            assert len(unit_match) == 1
3503            return unit_match[0]
3504        return None
3505
3506    def compound_unit_identifiers():
3507        for numerator in sanctioned_units:
3508            for denominator in sanctioned_units:
3509                yield "{}-per-{}".format(numerator, denominator)
3510
3511    supported_simple_units = {find_match(unit) for unit in sanctioned_units}
3512    assert None not in supported_simple_units
3513
3514    supported_compound_units = {
3515        unit_match
3516        for unit_match in (find_match(unit) for unit in compound_unit_identifiers())
3517        if unit_match
3518    }
3519
3520    return supported_simple_units | supported_compound_units
3521
3522
3523def readICUDataFilterForUnits(data_filter_file):
3524    with io.open(data_filter_file, mode="r", encoding="utf-8") as f:
3525        data_filter = json.load(f)
3526
3527    # Find the rule set for the "unit_tree".
3528    unit_tree_rules = [
3529        entry["rules"]
3530        for entry in data_filter["resourceFilters"]
3531        if entry["categories"] == ["unit_tree"]
3532    ]
3533    assert len(unit_tree_rules) == 1
3534
3535    # Compute the list of included units from that rule set. The regular expression must match
3536    # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound".
3537    included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$")
3538    filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0])
3539
3540    return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit}
3541
3542
3543def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units):
3544    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3545    intl_components_src_dir = os.path.join(
3546        js_src_builtin_intl_dir, "../../../../intl/components/src"
3547    )
3548
3549    def find_unit_type(unit):
3550        result = [
3551            unit_type for (unit_type, unit_name) in all_units if unit_name == unit
3552        ]
3553        assert result and len(result) == 1
3554        return result[0]
3555
3556    sanctioned_js_file = os.path.join(
3557        js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js"
3558    )
3559    with io.open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f:
3560        println = partial(print, file=f)
3561
3562        sanctioned_units_object = json.dumps(
3563            {unit: True for unit in sorted(sanctioned_units)},
3564            sort_keys=True,
3565            indent=4,
3566            separators=(",", ": "),
3567        )
3568
3569        println(generatedFileWarning)
3570
3571        println(
3572            """
3573/**
3574 * The list of currently supported simple unit identifiers.
3575 *
3576 * Intl.NumberFormat Unified API Proposal
3577 */"""
3578        )
3579
3580        println(
3581            "var sanctionedSimpleUnitIdentifiers = {};".format(sanctioned_units_object)
3582        )
3583
3584    sanctioned_h_file = os.path.join(intl_components_src_dir, "MeasureUnitGenerated.h")
3585    with io.open(sanctioned_h_file, mode="w", encoding="utf-8", newline="") as f:
3586        println = partial(print, file=f)
3587
3588        println(generatedFileWarning)
3589
3590        println(
3591            """
3592#ifndef intl_components_MeasureUnitGenerated_h
3593#define intl_components_MeasureUnitGenerated_h
3594
3595namespace mozilla::intl {
3596
3597struct SimpleMeasureUnit {
3598  const char* const type;
3599  const char* const name;
3600};
3601
3602/**
3603 * The list of currently supported simple unit identifiers.
3604 *
3605 * The list must be kept in alphabetical order of |name|.
3606 */
3607inline constexpr SimpleMeasureUnit simpleMeasureUnits[] = {
3608    // clang-format off"""
3609        )
3610
3611        for unit_name in sorted(sanctioned_units):
3612            println('  {{"{}", "{}"}},'.format(find_unit_type(unit_name), unit_name))
3613
3614        println(
3615            """
3616    // clang-format on
3617};
3618
3619}  // namespace mozilla::intl
3620
3621#endif
3622""".strip(
3623                "\n"
3624            )
3625        )
3626
3627    writeUnitTestFiles(all_units, sanctioned_units)
3628
3629
3630def writeUnitTestFiles(all_units, sanctioned_units):
3631    """Generate test files for unit number formatters."""
3632
3633    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3634    test_dir = os.path.join(
3635        js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat"
3636    )
3637
3638    def write_test(file_name, test_content, indent=4):
3639        file_path = os.path.join(test_dir, file_name)
3640        with io.open(file_path, mode="w", encoding="utf-8", newline="") as f:
3641            println = partial(print, file=f)
3642
3643            println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
3644            println("")
3645            println(generatedFileWarning)
3646            println("")
3647
3648            sanctioned_units_array = json.dumps(
3649                [unit for unit in sorted(sanctioned_units)],
3650                indent=indent,
3651                separators=(",", ": "),
3652            )
3653
3654            println(
3655                "const sanctionedSimpleUnitIdentifiers = {};".format(
3656                    sanctioned_units_array
3657                )
3658            )
3659
3660            println(test_content)
3661
3662            println(
3663                """
3664if (typeof reportCompare === "function")
3665{}reportCompare(true, true);""".format(
3666                    " " * indent
3667                )
3668            )
3669
3670    write_test(
3671        "unit-compound-combinations.js",
3672        """
3673// Test all simple unit identifier combinations are allowed.
3674
3675for (const numerator of sanctionedSimpleUnitIdentifiers) {
3676    for (const denominator of sanctionedSimpleUnitIdentifiers) {
3677        const unit = `${numerator}-per-${denominator}`;
3678        const nf = new Intl.NumberFormat("en", {style: "unit", unit});
3679
3680        assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
3681    }
3682}""",
3683    )
3684
3685    all_units_array = json.dumps(
3686        ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ")
3687    )
3688
3689    write_test(
3690        "unit-well-formed.js",
3691        """
3692const allUnits = {};
3693""".format(
3694            all_units_array
3695        )
3696        + """
3697// Test only sanctioned unit identifiers are allowed.
3698
3699for (const typeAndUnit of allUnits) {
3700    const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/);
3701
3702    let allowed;
3703    if (unit.includes("-per-")) {
3704        const [numerator, denominator] = unit.split("-per-");
3705        allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) &&
3706                  sanctionedSimpleUnitIdentifiers.includes(denominator);
3707    } else {
3708        allowed = sanctionedSimpleUnitIdentifiers.includes(unit);
3709    }
3710
3711    if (allowed) {
3712        const nf = new Intl.NumberFormat("en", {style: "unit", unit});
3713        assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
3714    } else {
3715        assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}),
3716                               RangeError, `Missing error for "${typeAndUnit}"`);
3717    }
3718}""",
3719    )
3720
3721    write_test(
3722        "unit-formatToParts-has-unit-field.js",
3723        """
3724// Test only English and Chinese to keep the overall runtime reasonable.
3725//
3726// Chinese is included because it contains more than one "unit" element for
3727// certain unit combinations.
3728const locales = ["en", "zh"];
3729
3730// Plural rules for English only differentiate between "one" and "other". Plural
3731// rules for Chinese only use "other". That means we only need to test two values
3732// per unit.
3733const values = [0, 1];
3734
3735// Ensure unit formatters contain at least one "unit" element.
3736
3737for (const locale of locales) {
3738  for (const unit of sanctionedSimpleUnitIdentifiers) {
3739    const nf = new Intl.NumberFormat(locale, {style: "unit", unit});
3740
3741    for (const value of values) {
3742      assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
3743               `locale=${locale}, unit=${unit}`);
3744    }
3745  }
3746
3747  for (const numerator of sanctionedSimpleUnitIdentifiers) {
3748    for (const denominator of sanctionedSimpleUnitIdentifiers) {
3749      const unit = `${numerator}-per-${denominator}`;
3750      const nf = new Intl.NumberFormat(locale, {style: "unit", unit});
3751
3752      for (const value of values) {
3753        assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
3754                 `locale=${locale}, unit=${unit}`);
3755      }
3756    }
3757  }
3758}""",
3759        indent=2,
3760    )
3761
3762
3763def updateUnits(topsrcdir, args):
3764    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3765    icu_path = os.path.join(topsrcdir, "intl", "icu")
3766    icu_unit_path = os.path.join(icu_path, "source", "data", "unit")
3767
3768    with io.open(
3769        os.path.join(js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiers.yaml"),
3770        mode="r",
3771        encoding="utf-8",
3772    ) as f:
3773        sanctioned_units = yaml.safe_load(f)
3774
3775    # Read all possible ICU unit identifiers from the "unit/root.txt" resource.
3776    unit_root_file = os.path.join(icu_unit_path, "root.txt")
3777    all_units = readICUUnitResourceFile(unit_root_file)
3778
3779    # Compute the set of effectively supported ICU unit identifiers.
3780    supported_units = computeSupportedUnits(all_units, sanctioned_units)
3781
3782    # Read the list of units we're including into the ICU data file.
3783    data_filter_file = os.path.join(icu_path, "data_filter.json")
3784    filtered_units = readICUDataFilterForUnits(data_filter_file)
3785
3786    # Both sets must match to avoid resource loading errors at runtime.
3787    if supported_units != filtered_units:
3788
3789        def units_to_string(units):
3790            return ", ".join("/".join(u) for u in units)
3791
3792        missing = supported_units - filtered_units
3793        if missing:
3794            raise RuntimeError("Missing units: {}".format(units_to_string(missing)))
3795
3796        # Not exactly an error, but we currently don't have a use case where we need to support
3797        # more units than required by ECMA-402.
3798        extra = filtered_units - supported_units
3799        if extra:
3800            raise RuntimeError("Unnecessary units: {}".format(units_to_string(extra)))
3801
3802    writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units)
3803
3804
3805def readICUNumberingSystemsResourceFile(filepath):
3806    """Returns a dictionary of numbering systems where the key denotes the numbering system name
3807    and the value a dictionary with additional numbering system data.
3808
3809    Example:
3810
3811    numberingSystems:table(nofallback){
3812        numberingSystems{
3813            latn{
3814                algorithmic:int{0}
3815                desc{"0123456789"}
3816                radix:int{10}
3817            }
3818            roman{
3819                algorithmic:int{1}
3820                desc{"%roman-upper"}
3821                radix:int{10}
3822            }
3823        }
3824    }
3825
3826    Returns {"latn": {"digits": "0123456789", "algorithmic": False},
3827             "roman": {"algorithmic": True}}
3828    """
3829
3830    start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
3831    end_table_re = re.compile(r"^\}$")
3832    table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")
3833
3834    # The current resource table.
3835    table = {}
3836
3837    # List of parent tables when parsing.
3838    parents = []
3839
3840    # Track multi-line comments state.
3841    in_multiline_comment = False
3842
3843    for line in flines(filepath, "utf-8-sig"):
3844        # Remove leading and trailing whitespace.
3845        line = line.strip()
3846
3847        # Skip over comments.
3848        if in_multiline_comment:
3849            if line.endswith("*/"):
3850                in_multiline_comment = False
3851            continue
3852
3853        if line.startswith("//"):
3854            continue
3855
3856        if line.startswith("/*"):
3857            in_multiline_comment = True
3858            continue
3859
3860        # Try to match the start of a table, e.g. `latn{`.
3861        match = start_table_re.match(line)
3862        if match:
3863            parents.append(table)
3864            table_name = match.group(1)
3865            new_table = {}
3866            table[table_name] = new_table
3867            table = new_table
3868            continue
3869
3870        # Try to match the end of a table.
3871        match = end_table_re.match(line)
3872        if match:
3873            table = parents.pop()
3874            continue
3875
3876        # Try to match a table entry, e.g. `desc{"0123456789"}`.
3877        match = table_entry_re.match(line)
3878        if match:
3879            entry_key = match.group(1)
3880            entry_value = (
3881                match.group(2) if match.group(2) is not None else int(match.group(3))
3882            )
3883            table[entry_key] = entry_value
3884            continue
3885
3886        raise Exception("unexpected line: '{}' in {}".format(line, filepath))
3887
3888    assert len(parents) == 0, "Not all tables closed"
3889    assert len(table) == 1, "More than one root table"
3890
3891    # Remove the two top-level "numberingSystems" tables.
3892    (_, numbering_systems) = table.popitem()
3893    (_, numbering_systems) = numbering_systems.popitem()
3894
3895    # Assert all numbering systems use base 10.
3896    assert all(ns["radix"] == 10 for ns in numbering_systems.values())
3897
3898    # Return the numbering systems.
3899    return {
3900        key: {"digits": value["desc"], "algorithmic": False}
3901        if not bool(value["algorithmic"])
3902        else {"algorithmic": True}
3903        for (key, value) in numbering_systems.items()
3904    }
3905
3906
3907def writeNumberingSystemFiles(numbering_systems):
3908    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3909
3910    numbering_systems_js_file = os.path.join(
3911        js_src_builtin_intl_dir, "NumberingSystemsGenerated.h"
3912    )
3913    with io.open(
3914        numbering_systems_js_file, mode="w", encoding="utf-8", newline=""
3915    ) as f:
3916        println = partial(print, file=f)
3917
3918        println(generatedFileWarning)
3919
3920        println(
3921            """
3922/**
3923 * The list of numbering systems with simple digit mappings.
3924 */
3925
3926#ifndef builtin_intl_NumberingSystemsGenerated_h
3927#define builtin_intl_NumberingSystemsGenerated_h
3928"""
3929        )
3930
3931        simple_numbering_systems = sorted(
3932            name
3933            for (name, value) in numbering_systems.items()
3934            if not value["algorithmic"]
3935        )
3936
3937        println("// clang-format off")
3938        println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\")
3939        println(
3940            "{}".format(
3941                ", \\\n".join(
3942                    '  "{}"'.format(name) for name in simple_numbering_systems
3943                )
3944            )
3945        )
3946        println("// clang-format on")
3947        println("")
3948
3949        println("#endif  // builtin_intl_NumberingSystemsGenerated_h")
3950
3951    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3952    test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl")
3953
3954    intl_shell_js_file = os.path.join(test_dir, "shell.js")
3955
3956    with io.open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f:
3957        println = partial(print, file=f)
3958
3959        println(generatedFileWarning)
3960
3961        println(
3962            """
3963// source: CLDR file common/bcp47/number.xml; version CLDR {}.
3964// https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml
3965// https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml
3966""".format(
3967                readCLDRVersionFromICU()
3968            ).rstrip()
3969        )
3970
3971        numbering_systems_object = json.dumps(
3972            numbering_systems,
3973            indent=2,
3974            separators=(",", ": "),
3975            sort_keys=True,
3976            ensure_ascii=False,
3977        )
3978        println("const numberingSystems = {};".format(numbering_systems_object))
3979
3980
3981def updateNumberingSystems(topsrcdir, args):
3982    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
3983    icu_path = os.path.join(topsrcdir, "intl", "icu")
3984    icu_misc_path = os.path.join(icu_path, "source", "data", "misc")
3985
3986    with io.open(
3987        os.path.join(js_src_builtin_intl_dir, "NumberingSystems.yaml"),
3988        mode="r",
3989        encoding="utf-8",
3990    ) as f:
3991        numbering_systems = yaml.safe_load(f)
3992
3993    # Read all possible ICU unit identifiers from the "misc/numberingSystems.txt" resource.
3994    misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt")
3995    all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file)
3996
3997    all_numbering_systems_simple_digits = {
3998        name
3999        for (name, value) in all_numbering_systems.items()
4000        if not value["algorithmic"]
4001    }
4002
4003    # Assert ICU includes support for all required numbering systems. If this assertion fails,
4004    # something is broken in ICU.
4005    assert all_numbering_systems_simple_digits.issuperset(
4006        numbering_systems
4007    ), "{}".format(numbering_systems.difference(all_numbering_systems_simple_digits))
4008
4009    # Assert the spec requires support for all numbering systems with simple digit mappings. If
4010    # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new
4011    # numbering systems.
4012    assert all_numbering_systems_simple_digits.issubset(numbering_systems), "{}".format(
4013        all_numbering_systems_simple_digits.difference(numbering_systems)
4014    )
4015
4016    writeNumberingSystemFiles(all_numbering_systems)
4017
4018
4019if __name__ == "__main__":
4020    import argparse
4021
4022    # This script must reside in js/src/builtin/intl to work correctly.
4023    (thisDir, thisFile) = os.path.split(os.path.abspath(__file__))
4024    dirPaths = os.path.normpath(thisDir).split(os.sep)
4025    if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
4026        raise RuntimeError("%s must reside in js/src/builtin/intl" % __file__)
4027    topsrcdir = "/".join(dirPaths[:-4])
4028
4029    def EnsureHttps(v):
4030        if not v.startswith("https:"):
4031            raise argparse.ArgumentTypeError("URL protocol must be https: " % v)
4032        return v
4033
4034    parser = argparse.ArgumentParser(description="Update intl data.")
4035    subparsers = parser.add_subparsers(help="Select update mode")
4036
4037    parser_cldr_tags = subparsers.add_parser(
4038        "langtags", help="Update CLDR language tags data"
4039    )
4040    parser_cldr_tags.add_argument(
4041        "--version", metavar="VERSION", help="CLDR version number"
4042    )
4043    parser_cldr_tags.add_argument(
4044        "--url",
4045        metavar="URL",
4046        default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
4047        type=EnsureHttps,
4048        help="Download url CLDR data (default: %(default)s)",
4049    )
4050    parser_cldr_tags.add_argument(
4051        "--out",
4052        default=os.path.join(
4053            topsrcdir, "intl", "components", "src", "LocaleGenerated.cpp"
4054        ),
4055        help="Output file (default: %(default)s)",
4056    )
4057    parser_cldr_tags.add_argument(
4058        "file", nargs="?", help="Local cldr-core.zip file, if omitted uses <URL>"
4059    )
4060    parser_cldr_tags.set_defaults(func=updateCLDRLangTags)
4061
4062    parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
4063    parser_tz.add_argument(
4064        "--tz",
4065        help="Local tzdata directory or file, if omitted downloads tzdata "
4066        "distribution from https://www.iana.org/time-zones/",
4067    )
4068    # ICU doesn't include the backzone file by default, but we still like to
4069    # use the backzone time zone names to avoid user confusion. This does lead
4070    # to formatting "historic" dates (pre-1970 era) with the wrong time zone,
4071    # but that's probably acceptable for now.
4072    parser_tz.add_argument(
4073        "--ignore-backzone",
4074        action="store_true",
4075        help="Ignore tzdata's 'backzone' file. Can be enabled to generate more "
4076        "accurate time zone canonicalization reflecting the actual time "
4077        "zones as used by ICU.",
4078    )
4079    parser_tz.add_argument(
4080        "--out",
4081        default=os.path.join(thisDir, "TimeZoneDataGenerated.h"),
4082        help="Output file (default: %(default)s)",
4083    )
4084    parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))
4085
4086    parser_currency = subparsers.add_parser(
4087        "currency", help="Update currency digits mapping"
4088    )
4089    parser_currency.add_argument(
4090        "--url",
4091        metavar="URL",
4092        default="https://www.currency-iso.org/dam/downloads/lists/list_one.xml",  # NOQA: E501
4093        type=EnsureHttps,
4094        help="Download url for the currency & funds code list (default: "
4095        "%(default)s)",
4096    )
4097    parser_currency.add_argument(
4098        "--out",
4099        default=os.path.join(thisDir, "CurrencyDataGenerated.js"),
4100        help="Output file (default: %(default)s)",
4101    )
4102    parser_currency.add_argument(
4103        "file", nargs="?", help="Local currency code list file, if omitted uses <URL>"
4104    )
4105    parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir))
4106
4107    parser_units = subparsers.add_parser(
4108        "units", help="Update sanctioned unit identifiers mapping"
4109    )
4110    parser_units.set_defaults(func=partial(updateUnits, topsrcdir))
4111
4112    parser_numbering_systems = subparsers.add_parser(
4113        "numbering", help="Update numbering systems with simple digit mappings"
4114    )
4115    parser_numbering_systems.set_defaults(
4116        func=partial(updateNumberingSystems, topsrcdir)
4117    )
4118
4119    args = parser.parse_args()
4120    args.func(args)
4121