1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# This Source Code Form is subject to the terms of the Mozilla Public 5# License, v. 2.0. If a copy of the MPL was not distributed with this 6# file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 8""" Usage: 9 make_intl_data.py langtags [cldr_core.zip] 10 make_intl_data.py tzdata 11 make_intl_data.py currency 12 make_intl_data.py units 13 make_intl_data.py numbering 14 15 16 Target "langtags": 17 This script extracts information about 1) mappings between deprecated and 18 current Unicode BCP 47 locale identifiers, and 2) deprecated and current 19 BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping 20 code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp. 21 22 23 Target "tzdata": 24 This script computes which time zone informations are not up-to-date in ICU 25 and provides the necessary mappings to workaround this problem. 26 https://ssl.icu-project.org/trac/ticket/12044 27 28 29 Target "currency": 30 Generates the mapping from currency codes to decimal digits used for them. 31 32 33 Target "units": 34 Generate source and test files using the list of so-called "sanctioned unit 35 identifiers" and verifies that the ICU data filter includes these units. 36 37 38 Target "numbering": 39 Generate source and test files using the list of numbering systems with 40 simple digit mappings and verifies that it's in sync with ICU/CLDR. 
41""" 42 43from __future__ import print_function 44import os 45import re 46import io 47import json 48import shutil 49import sys 50import tarfile 51import tempfile 52import yaml 53from contextlib import closing 54from functools import partial, total_ordering 55from itertools import chain, groupby, tee 56from operator import attrgetter, itemgetter 57from zipfile import ZipFile 58 59if sys.version_info.major == 2: 60 from itertools import ( 61 ifilter as filter, 62 ifilterfalse as filterfalse, 63 imap as map, 64 izip_longest as zip_longest, 65 ) 66 from urllib2 import urlopen, Request as UrlRequest 67 from urlparse import urlsplit 68else: 69 from itertools import filterfalse, zip_longest 70 from urllib.request import urlopen, Request as UrlRequest 71 from urllib.parse import urlsplit 72 73 74# From https://docs.python.org/3/library/itertools.html 75def grouper(iterable, n, fillvalue=None): 76 "Collect data into fixed-length chunks or blocks" 77 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" 78 args = [iter(iterable)] * n 79 return zip_longest(*args, fillvalue=fillvalue) 80 81 82def writeMappingHeader(println, description, source, url): 83 if type(description) is not list: 84 description = [description] 85 for desc in description: 86 println("// {0}".format(desc)) 87 println("// Derived from {0}.".format(source)) 88 println("// {0}".format(url)) 89 90 91def writeMappingsVar(println, mapping, name, description, source, url): 92 """Writes a variable definition with a mapping table. 93 94 Writes the contents of dictionary |mapping| through the |println| 95 function with the given variable name and a comment with description, 96 fileDate, and URL. 
97 """ 98 println("") 99 writeMappingHeader(println, description, source, url) 100 println("var {0} = {{".format(name)) 101 for (key, value) in sorted(mapping.items(), key=itemgetter(0)): 102 println(' "{0}": "{1}",'.format(key, value)) 103 println("};") 104 105 106def writeMappingsBinarySearch( 107 println, 108 fn_name, 109 type_name, 110 name, 111 validate_fn, 112 validate_case_fn, 113 mappings, 114 tag_maxlength, 115 description, 116 source, 117 url, 118): 119 """Emit code to perform a binary search on language tag subtags. 120 121 Uses the contents of |mapping|, which can either be a dictionary or set, 122 to emit a mapping function to find subtag replacements. 123 """ 124 println("") 125 writeMappingHeader(println, description, source, url) 126 println( 127 """ 128bool js::intl::LanguageTag::{0}({1} {2}) {{ 129 MOZ_ASSERT({3}({2}.span())); 130 MOZ_ASSERT({4}({2}.span())); 131""".format( 132 fn_name, type_name, name, validate_fn, validate_case_fn 133 ).strip() 134 ) 135 writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength) 136 137 println( 138 """ 139}""".lstrip( 140 "\n" 141 ) 142 ) 143 144 145def writeMappingsBinarySearchBody( 146 println, source_name, target_name, mappings, tag_maxlength 147): 148 def write_array(subtags, name, length, fixed): 149 if fixed: 150 println( 151 " static const char {}[{}][{}] = {{".format( 152 name, len(subtags), length + 1 153 ) 154 ) 155 else: 156 println(" static const char* {}[{}] = {{".format(name, len(subtags))) 157 158 # Group in pairs of ten to not exceed the 80 line column limit. 159 for entries in grouper(subtags, 10): 160 entries = ( 161 '"{}"'.format(tag).rjust(length + 2) 162 for tag in entries 163 if tag is not None 164 ) 165 println(" {},".format(", ".join(entries))) 166 167 println(" };") 168 169 trailing_return = True 170 171 # Sort the subtags by length. 
That enables using an optimized comparator 172 # for the binary search, which only performs a single |memcmp| for multiple 173 # of two subtag lengths. 174 mappings_keys = mappings.keys() if type(mappings) == dict else mappings 175 for (length, subtags) in groupby(sorted(mappings_keys, key=len), len): 176 # Omit the length check if the current length is the maximum length. 177 if length != tag_maxlength: 178 println( 179 """ 180 if ({}.length() == {}) {{ 181""".format( 182 source_name, length 183 ).rstrip( 184 "\n" 185 ) 186 ) 187 else: 188 trailing_return = False 189 println( 190 """ 191 { 192""".rstrip( 193 "\n" 194 ) 195 ) 196 197 # The subtags need to be sorted for binary search to work. 198 subtags = sorted(subtags) 199 200 def equals(subtag): 201 return """{}.equalTo("{}")""".format(source_name, subtag) 202 203 # Don't emit a binary search for short lists. 204 if len(subtags) == 1: 205 if type(mappings) == dict: 206 println( 207 """ 208 if ({}) {{ 209 {}.set(mozilla::MakeStringSpan("{}")); 210 return true; 211 }} 212 return false; 213""".format( 214 equals(subtags[0]), target_name, mappings[subtags[0]] 215 ).strip( 216 "\n" 217 ) 218 ) 219 else: 220 println( 221 """ 222 return {}; 223""".format( 224 equals(subtags[0]) 225 ).strip( 226 "\n" 227 ) 228 ) 229 elif len(subtags) <= 4: 230 if type(mappings) == dict: 231 for subtag in subtags: 232 println( 233 """ 234 if ({}) {{ 235 {}.set("{}"); 236 return true; 237 }} 238""".format( 239 equals(subtag), target_name, mappings[subtag] 240 ).strip( 241 "\n" 242 ) 243 ) 244 245 println( 246 """ 247 return false; 248""".strip( 249 "\n" 250 ) 251 ) 252 else: 253 cond = (equals(subtag) for subtag in subtags) 254 cond = (" ||\n" + " " * (4 + len("return "))).join(cond) 255 println( 256 """ 257 return {}; 258""".format( 259 cond 260 ).strip( 261 "\n" 262 ) 263 ) 264 else: 265 write_array(subtags, source_name + "s", length, True) 266 267 if type(mappings) == dict: 268 write_array([mappings[k] for k in subtags], "aliases", 
length, False) 269 270 println( 271 """ 272 if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ 273 {1}.set(mozilla::MakeStringSpan(replacement)); 274 return true; 275 }} 276 return false; 277""".format( 278 source_name, target_name 279 ).rstrip() 280 ) 281 else: 282 println( 283 """ 284 return HasReplacement({0}s, {0}); 285""".format( 286 source_name 287 ).rstrip() 288 ) 289 290 println( 291 """ 292 } 293""".strip( 294 "\n" 295 ) 296 ) 297 298 if trailing_return: 299 println( 300 """ 301 return false;""" 302 ) 303 304 305def writeComplexLanguageTagMappings( 306 println, complex_language_mappings, description, source, url 307): 308 println("") 309 writeMappingHeader(println, description, source, url) 310 println( 311 """ 312void js::intl::LanguageTag::performComplexLanguageMappings() { 313 MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); 314 MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); 315""".lstrip() 316 ) 317 318 # Merge duplicate language entries. 
319 language_aliases = {} 320 for (deprecated_language, (language, script, region)) in sorted( 321 complex_language_mappings.items(), key=itemgetter(0) 322 ): 323 key = (language, script, region) 324 if key not in language_aliases: 325 language_aliases[key] = [] 326 else: 327 language_aliases[key].append(deprecated_language) 328 329 first_language = True 330 for (deprecated_language, (language, script, region)) in sorted( 331 complex_language_mappings.items(), key=itemgetter(0) 332 ): 333 key = (language, script, region) 334 if deprecated_language in language_aliases[key]: 335 continue 336 337 if_kind = "if" if first_language else "else if" 338 first_language = False 339 340 cond = ( 341 'language().equalTo("{}")'.format(lang) 342 for lang in [deprecated_language] + language_aliases[key] 343 ) 344 cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) 345 346 println( 347 """ 348 {} ({}) {{""".format( 349 if_kind, cond 350 ).strip( 351 "\n" 352 ) 353 ) 354 355 println( 356 """ 357 setLanguage("{}");""".format( 358 language 359 ).strip( 360 "\n" 361 ) 362 ) 363 364 if script is not None: 365 println( 366 """ 367 if (script().missing()) {{ 368 setScript("{}"); 369 }}""".format( 370 script 371 ).strip( 372 "\n" 373 ) 374 ) 375 if region is not None: 376 println( 377 """ 378 if (region().missing()) {{ 379 setRegion("{}"); 380 }}""".format( 381 region 382 ).strip( 383 "\n" 384 ) 385 ) 386 println( 387 """ 388 }""".strip( 389 "\n" 390 ) 391 ) 392 393 println( 394 """ 395} 396""".strip( 397 "\n" 398 ) 399 ) 400 401 402def writeComplexRegionTagMappings( 403 println, complex_region_mappings, description, source, url 404): 405 println("") 406 writeMappingHeader(println, description, source, url) 407 println( 408 """ 409void js::intl::LanguageTag::performComplexRegionMappings() { 410 MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); 411 MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); 412 MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); 
413 MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span())); 414""".lstrip() 415 ) 416 417 # |non_default_replacements| is a list and hence not hashable. Convert it 418 # to a string to get a proper hashable value. 419 def hash_key(default, non_default_replacements): 420 return (default, str(sorted(str(v) for v in non_default_replacements))) 421 422 # Merge duplicate region entries. 423 region_aliases = {} 424 for (deprecated_region, (default, non_default_replacements)) in sorted( 425 complex_region_mappings.items(), key=itemgetter(0) 426 ): 427 key = hash_key(default, non_default_replacements) 428 if key not in region_aliases: 429 region_aliases[key] = [] 430 else: 431 region_aliases[key].append(deprecated_region) 432 433 first_region = True 434 for (deprecated_region, (default, non_default_replacements)) in sorted( 435 complex_region_mappings.items(), key=itemgetter(0) 436 ): 437 key = hash_key(default, non_default_replacements) 438 if deprecated_region in region_aliases[key]: 439 continue 440 441 if_kind = "if" if first_region else "else if" 442 first_region = False 443 444 cond = ( 445 'region().equalTo("{}")'.format(region) 446 for region in [deprecated_region] + region_aliases[key] 447 ) 448 cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) 449 450 println( 451 """ 452 {} ({}) {{""".format( 453 if_kind, cond 454 ).strip( 455 "\n" 456 ) 457 ) 458 459 replacement_regions = sorted( 460 {region for (_, _, region) in non_default_replacements} 461 ) 462 463 first_case = True 464 for replacement_region in replacement_regions: 465 replacement_language_script = sorted( 466 (language, script) 467 for (language, script, region) in (non_default_replacements) 468 if region == replacement_region 469 ) 470 471 if_kind = "if" if first_case else "else if" 472 first_case = False 473 474 def compare_tags(language, script): 475 if script is None: 476 return 'language().equalTo("{}")'.format(language) 477 return '(language().equalTo("{}") && 
script().equalTo("{}"))'.format( 478 language, script 479 ) 480 481 cond = ( 482 compare_tags(language, script) 483 for (language, script) in replacement_language_script 484 ) 485 cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond) 486 487 println( 488 """ 489 {} ({}) {{ 490 setRegion("{}"); 491 }}""".format( 492 if_kind, cond, replacement_region 493 ) 494 .rstrip() 495 .strip("\n") 496 ) 497 498 println( 499 """ 500 else {{ 501 setRegion("{}"); 502 }} 503 }}""".format( 504 default 505 ) 506 .rstrip() 507 .strip("\n") 508 ) 509 510 println( 511 """ 512} 513""".strip( 514 "\n" 515 ) 516 ) 517 518 519def writeVariantTagMappings(println, variant_mappings, description, source, url): 520 """ Writes a function definition that maps variant subtags. """ 521 println( 522 """ 523static const char* ToCharPointer(const char* str) { 524 return str; 525} 526 527static const char* ToCharPointer(const js::UniqueChars& str) { 528 return str.get(); 529} 530 531template <typename T, typename U = T> 532static bool IsLessThan(const T& a, const U& b) { 533 return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; 534} 535""" 536 ) 537 writeMappingHeader(println, description, source, url) 538 println( 539 """ 540bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { 541 // The variant subtags need to be sorted for binary search. 542 MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), 543 IsLessThan<decltype(variants_)::ElementType>)); 544 545 auto removeVariantAt = [&](size_t index) { 546 variants_.erase(variants_.begin() + index); 547 }; 548 549 auto insertVariantSortedIfNotPresent = [&](const char* variant) { 550 auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, 551 IsLessThan<decltype(variants_)::ElementType, 552 decltype(variant)>); 553 554 // Don't insert the replacement when already present. 555 if (p != variants_.end() && strcmp(p->get(), variant) == 0) { 556 return true; 557 } 558 559 // Insert the preferred variant in sort order. 
560 auto preferred = DuplicateString(cx, variant); 561 if (!preferred) { 562 return false; 563 } 564 return !!variants_.insert(p, std::move(preferred)); 565 }; 566 567 for (size_t i = 0; i < variants_.length(); ) { 568 const char* variant = variants_[i].get(); 569 MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); 570""".lstrip() 571 ) 572 573 (no_alias, with_alias) = partition( 574 variant_mappings.items(), lambda item: item[1] is None 575 ) 576 577 no_replacements = " ||\n ".join( 578 f"""strcmp(variant, "{deprecated_variant}") == 0""" 579 for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0)) 580 ) 581 582 println( 583 f""" 584 if ({no_replacements}) {{ 585 removeVariantAt(i); 586 }} 587""".strip( 588 "\n" 589 ) 590 ) 591 592 for (deprecated_variant, (type, replacement)) in sorted( 593 with_alias, key=itemgetter(0) 594 ): 595 println( 596 f""" 597 else if (strcmp(variant, "{deprecated_variant}") == 0) {{ 598 removeVariantAt(i); 599""".strip( 600 "\n" 601 ) 602 ) 603 604 if type == "language": 605 println( 606 f""" 607 setLanguage("{replacement}"); 608""".strip( 609 "\n" 610 ) 611 ) 612 elif type == "region": 613 println( 614 f""" 615 setRegion("{replacement}"); 616""".strip( 617 "\n" 618 ) 619 ) 620 else: 621 assert type == "variant" 622 println( 623 f""" 624 if (!insertVariantSortedIfNotPresent("{replacement}")) {{ 625 return false; 626 }} 627""".strip( 628 "\n" 629 ) 630 ) 631 632 println( 633 """ 634 } 635""".strip( 636 "\n" 637 ) 638 ) 639 640 println( 641 """ 642 else { 643 i++; 644 } 645 } 646 return true; 647} 648""".strip( 649 "\n" 650 ) 651 ) 652 653 654def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url): 655 """ Writes a function definition that maps legacy language tags. 
""" 656 println("") 657 writeMappingHeader(println, description, source, url) 658 println( 659 """\ 660bool js::intl::LanguageTag::updateLegacyMappings(JSContext* cx) { 661 // We're mapping legacy tags to non-legacy form here. 662 // Other tags remain unchanged. 663 // 664 // Legacy tags are either sign language tags ("sgn") or have one or multiple 665 // variant subtags. Therefore we can quickly exclude most tags by checking 666 // these two subtags. 667 668 MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); 669 670 if (!language().equalTo("sgn") && variants().length() == 0) { 671 return true; 672 } 673 674 for ([[maybe_unused]] const auto& variant : variants()) { 675 MOZ_ASSERT(IsStructurallyValidVariantTag(mozilla::MakeStringSpan(variant.get()))); 676 MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get()))); 677 } 678 679 // The variant subtags need to be sorted for binary search. 680 MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), 681 IsLessThan<decltype(variants_)::ElementType>)); 682 683 auto findVariant = [this](const char* variant) { 684 auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, 685 IsLessThan<decltype(variants_)::ElementType, 686 decltype(variant)>); 687 688 if (p != variants_.end() && strcmp(p->get(), variant) == 0) { 689 return p; 690 } 691 return static_cast<decltype(p)>(nullptr); 692 }; 693 694 auto insertVariantSortedIfNotPresent = [&](const char* variant) { 695 auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, 696 IsLessThan<decltype(variants_)::ElementType, 697 decltype(variant)>); 698 699 // Don't insert the replacement when already present. 700 if (p != variants_.end() && strcmp(p->get(), variant) == 0) { 701 return true; 702 } 703 704 // Insert the preferred variant in sort order. 
705 auto preferred = DuplicateString(cx, variant); 706 if (!preferred) { 707 return false; 708 } 709 return !!variants_.insert(p, std::move(preferred)); 710 }; 711 712 auto removeVariant = [&](auto* p) { 713 size_t index = std::distance(variants_.begin(), p); 714 variants_.erase(variants_.begin() + index); 715 }; 716 717 auto removeVariants = [&](auto* p, auto* q) { 718 size_t pIndex = std::distance(variants_.begin(), p); 719 size_t qIndex = std::distance(variants_.begin(), q); 720 MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted"); 721 722 variants_.erase(variants_.begin() + qIndex); 723 variants_.erase(variants_.begin() + pIndex); 724 };""" 725 ) 726 727 # Helper class for pattern matching. 728 class AnyClass: 729 def __eq__(self, obj): 730 return obj is not None 731 732 Any = AnyClass() 733 734 # Group the mappings by language. 735 legacy_mappings_by_language = {} 736 for (type, replacement) in legacy_mappings.items(): 737 (language, _, _, _) = type 738 legacy_mappings_by_language.setdefault(language, {})[type] = replacement 739 740 # Handle the empty language case first. 741 if None in legacy_mappings_by_language: 742 # Get the mappings and remove them from the dict. 743 mappings = legacy_mappings_by_language.pop(None) 744 745 # This case only applies for the "hepburn-heploc" -> "alalc97" 746 # mapping, so just inline it here. 747 from_tag = (None, None, None, "hepburn-heploc") 748 to_tag = (None, None, None, "alalc97") 749 750 assert len(mappings) == 1 751 assert mappings[from_tag] == to_tag 752 753 println( 754 """ 755 if (variants().length() >= 2) { 756 if (auto* hepburn = findVariant("hepburn")) { 757 if (auto* heploc = findVariant("heploc")) { 758 removeVariants(hepburn, heploc); 759 760 if (!insertVariantSortedIfNotPresent("alalc97")) { 761 return false; 762 } 763 } 764 } 765 } 766""" 767 ) 768 769 # Handle sign languages next. 
770 if "sgn" in legacy_mappings_by_language: 771 mappings = legacy_mappings_by_language.pop("sgn") 772 773 # Legacy sign language mappings have the form "sgn-XX" where "XX" is 774 # some region code. 775 assert all(type == ("sgn", None, Any, None) for type in mappings.keys()) 776 777 # Legacy sign languages are mapped to a single language subtag. 778 assert all( 779 replacement == (Any, None, None, None) for replacement in mappings.values() 780 ) 781 782 println( 783 """ 784 if (language().equalTo("sgn")) { 785 if (region().present() && signLanguageMapping(language_, region())) { 786 region_.set(mozilla::MakeStringSpan("")); 787 } 788 } 789""".rstrip().lstrip( 790 "\n" 791 ) 792 ) 793 794 # Finally handle all remaining cases. 795 796 # The remaining mappings have neither script nor region subtags in the source locale. 797 assert all( 798 type == (Any, None, None, Any) 799 for mappings in legacy_mappings_by_language.values() 800 for type in mappings.keys() 801 ) 802 803 # And they have neither script nor region nor variant subtags in the target locale. 804 assert all( 805 replacement == (Any, None, None, None) 806 for mappings in legacy_mappings_by_language.values() 807 for replacement in mappings.values() 808 ) 809 810 # Compact the mappings table by removing empty fields. 811 legacy_mappings_by_language = { 812 lang: { 813 variants: r_language 814 for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items() 815 } 816 for (lang, mappings) in legacy_mappings_by_language.items() 817 } 818 819 # Try to combine the remaining cases. 820 legacy_mappings_compact = {} 821 822 # Python can't hash dicts or lists, so use the string representation as the hash key. 
823 def hash_key(mappings): 824 return str(sorted(mappings.items(), key=itemgetter(0))) 825 826 for (lang, mappings) in sorted( 827 legacy_mappings_by_language.items(), key=itemgetter(0) 828 ): 829 key = hash_key(mappings) 830 legacy_mappings_compact.setdefault(key, []).append(lang) 831 832 for langs in legacy_mappings_compact.values(): 833 language_equal_to = ( 834 f"""language().equalTo("{lang}")""" for lang in sorted(langs) 835 ) 836 cond = f""" ||\n{" " * len(" else if (")}""".join(language_equal_to) 837 838 println( 839 f""" 840 else if ({cond}) {{ 841""".rstrip().lstrip( 842 "\n" 843 ) 844 ) 845 846 mappings = legacy_mappings_by_language[langs[0]] 847 848 # Count the variant subtags to determine the sort order. 849 def variant_size(m): 850 (k, _) = m 851 return len(k.split("-")) 852 853 # Alias rules are applied by largest union size first. 854 for (size, mappings_by_size) in groupby( 855 sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size 856 ): 857 858 # Convert grouper object to dict. 
859 mappings_by_size = dict(mappings_by_size) 860 861 is_first = True 862 chain_if = size == 1 863 864 # Alias rules are applied in alphabetical order 865 for (variants, r_language) in sorted( 866 mappings_by_size.items(), key=itemgetter(0) 867 ): 868 sorted_variants = sorted(variants.split("-")) 869 len_variants = len(sorted_variants) 870 871 maybe_else = "else " if chain_if and not is_first else "" 872 is_first = False 873 874 for (i, variant) in enumerate(sorted_variants): 875 println( 876 f""" 877 {" " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{ 878""".rstrip().lstrip( 879 "\n" 880 ) 881 ) 882 883 indent = " " * len_variants 884 885 println( 886 f""" 887 {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)}); 888 {indent}setLanguage("{r_language}"); 889 {indent}{"return true;" if not chain_if else ""} 890""".rstrip().lstrip( 891 "\n" 892 ) 893 ) 894 895 for i in range(len_variants, 0, -1): 896 println( 897 f""" 898 {" " * (i - 1)}}} 899""".rstrip().lstrip( 900 "\n" 901 ) 902 ) 903 904 println( 905 """ 906 } 907""".rstrip().lstrip( 908 "\n" 909 ) 910 ) 911 912 println( 913 """ 914 return true; 915}""" 916 ) 917 918 919def writeSignLanguageMappingsFunction( 920 println, legacy_mappings, description, source, url 921): 922 """ Writes a function definition that maps legacy sign language tags. 
""" 923 println("") 924 writeMappingHeader(println, description, source, url) 925 println( 926 """\ 927bool js::intl::LanguageTag::signLanguageMapping(LanguageSubtag& language, 928 const RegionSubtag& region) { 929 MOZ_ASSERT(language.equalTo("sgn")); 930 MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); 931 MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.span())); 932""".rstrip() 933 ) 934 935 region_mappings = { 936 rg: lg 937 for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items() 938 if lang == "sgn" 939 } 940 941 source_name = "region" 942 target_name = "language" 943 tag_maxlength = 3 944 writeMappingsBinarySearchBody( 945 println, source_name, target_name, region_mappings, tag_maxlength 946 ) 947 948 println( 949 """ 950}""".lstrip() 951 ) 952 953 954def readSupplementalData(core_file): 955 """Reads CLDR Supplemental Data and extracts information for Intl.js. 956 957 Information extracted: 958 - legacyMappings: mappings from legacy tags to preferred complete language tags 959 - languageMappings: mappings from language subtags to preferred subtags 960 - complexLanguageMappings: mappings from language subtags with complex rules 961 - regionMappings: mappings from region subtags to preferred subtags 962 - complexRegionMappings: mappings from region subtags with complex rules 963 - variantMappings: mappings from variant subtags to preferred subtags 964 - likelySubtags: likely subtags used for generating test data only 965 Returns these mappings as dictionaries. 966 """ 967 import xml.etree.ElementTree as ET 968 969 # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. 970 re_unicode_language_id = re.compile( 971 r""" 972 ^ 973 # unicode_language_id = unicode_language_subtag 974 # unicode_language_subtag = alpha{2,3} | alpha{5,8} 975 (?P<language>[a-z]{2,3}|[a-z]{5,8}) 976 977 # (sep unicode_script_subtag)? 978 # unicode_script_subtag = alpha{4} 979 (?:-(?P<script>[a-z]{4}))? 980 981 # (sep unicode_region_subtag)? 
982 # unicode_region_subtag = (alpha{2} | digit{3}) 983 (?:-(?P<region>([a-z]{2}|[0-9]{3})))? 984 985 # (sep unicode_variant_subtag)* 986 # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) 987 (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? 988 $ 989 """, 990 re.IGNORECASE | re.VERBOSE, 991 ) 992 993 # CLDR uses "_" as the separator for some elements. Replace it with "-". 994 def bcp47_id(cldr_id): 995 return cldr_id.replace("_", "-") 996 997 # Return the tuple (language, script, region, variants) and assert all 998 # subtags are in canonical case. 999 def bcp47_canonical(language, script, region, variants): 1000 # Canonical case for language subtags is lower case. 1001 assert language is None or language.lower() == language 1002 1003 # Canonical case for script subtags is title case. 1004 assert script is None or script.title() == script 1005 1006 # Canonical case for region subtags is upper case. 1007 assert region is None or region.upper() == region 1008 1009 # Canonical case for variant subtags is lower case. 1010 assert variants is None or variants.lower() == variants 1011 1012 return (language, script, region, variants[1:] if variants else None) 1013 1014 # Language ids are interpreted as multi-maps in 1015 # <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>. 1016 # 1017 # See UTS35, §Annex C, Definitions - 1. Multimap interpretation. 1018 def language_id_to_multimap(language_id): 1019 match = re_unicode_language_id.match(language_id) 1020 assert ( 1021 match is not None 1022 ), f"{language_id} invalid Unicode BCP 47 locale identifier" 1023 1024 canonical_language_id = bcp47_canonical( 1025 *match.group("language", "script", "region", "variants") 1026 ) 1027 (language, _, _, _) = canonical_language_id 1028 1029 # Normalize "und" language to None, but keep the rest as is. 
1030 return (language if language != "und" else None,) + canonical_language_id[1:] 1031 1032 rules = {} 1033 territory_exception_rules = {} 1034 1035 tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml")) 1036 1037 # Load the rules from supplementalMetadata.xml. 1038 # 1039 # See UTS35, §Annex C, Definitions - 2. Alias elements. 1040 # See UTS35, §Annex C, Preprocessing. 1041 for alias_name in [ 1042 "languageAlias", 1043 "scriptAlias", 1044 "territoryAlias", 1045 "variantAlias", 1046 ]: 1047 for alias in tree.iterfind(".//" + alias_name): 1048 # Replace '_' by '-'. 1049 type = bcp47_id(alias.get("type")) 1050 replacement = bcp47_id(alias.get("replacement")) 1051 1052 # Prefix with "und-". 1053 if alias_name != "languageAlias": 1054 type = "und-" + type 1055 1056 # Discard all rules where the type is an invalid languageId. 1057 if re_unicode_language_id.match(type) is None: 1058 continue 1059 1060 type = language_id_to_multimap(type) 1061 1062 # Multiple, whitespace-separated territory replacements may be present. 1063 if alias_name == "territoryAlias" and " " in replacement: 1064 replacements = replacement.split(" ") 1065 replacement_list = [ 1066 language_id_to_multimap("und-" + r) for r in replacements 1067 ] 1068 1069 assert ( 1070 type not in territory_exception_rules 1071 ), f"Duplicate alias rule: {type}" 1072 1073 territory_exception_rules[type] = replacement_list 1074 1075 # The first element is the default territory replacement. 1076 replacement = replacements[0] 1077 1078 # Prefix with "und-". 1079 if alias_name != "languageAlias": 1080 replacement = "und-" + replacement 1081 1082 replacement = language_id_to_multimap(replacement) 1083 1084 assert type not in rules, f"Duplicate alias rule: {type}" 1085 1086 rules[type] = replacement 1087 1088 # Helper class for pattern matching. 
1089 class AnyClass: 1090 def __eq__(self, obj): 1091 return obj is not None 1092 1093 Any = AnyClass() 1094 1095 modified_rules = True 1096 loop_count = 0 1097 1098 while modified_rules: 1099 modified_rules = False 1100 loop_count += 1 1101 1102 # UTS 35 defines that canonicalization is applied until a fixed point has 1103 # been reached. This iterative application of the canonicalization algorithm 1104 # is only needed for a relatively small set of rules, so we can precompute 1105 # the transitive closure of all rules here and then perform a single pass 1106 # when canonicalizing language tags at runtime. 1107 transitive_rules = {} 1108 1109 # Compute the transitive closure. 1110 # Any case which currently doesn't occur in the CLDR sources isn't supported 1111 # and will lead to throwing an error. 1112 for (type, replacement) in rules.items(): 1113 (language, script, region, variants) = type 1114 (r_language, r_script, r_region, r_variants) = replacement 1115 1116 for (i_type, i_replacement) in rules.items(): 1117 (i_language, i_script, i_region, i_variants) = i_type 1118 (i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement 1119 1120 if i_language is not None and i_language == r_language: 1121 # This case currently only occurs when neither script nor region 1122 # subtags are present. A single variant subtags may be present 1123 # in |type|. And |i_type| definitely has a single variant subtag. 1124 # Should this ever change, update this code accordingly. 1125 assert type == (Any, None, None, None) or type == ( 1126 Any, 1127 None, 1128 None, 1129 Any, 1130 ) 1131 assert replacement == (Any, None, None, None) 1132 assert i_type == (Any, None, None, Any) 1133 assert i_replacement == (Any, None, None, None) 1134 1135 # This case happens for the rules 1136 # "zh-guoyu -> zh", 1137 # "zh-hakka -> hak", and 1138 # "und-hakka -> und". 
1139 # Given the possible input "zh-guoyu-hakka", the first rule will 1140 # change it to "zh-hakka", and then the second rule can be 1141 # applied. (The third rule isn't applied ever.) 1142 # 1143 # Let's assume there's a hypothetical rule 1144 # "zh-aaaaa" -> "en" 1145 # And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en" 1146 # is applied before "zh-hakka -> hak", because rules are sorted 1147 # alphabetically. That means the overall result is "en": 1148 # "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then 1149 # "hakka" is removed through the third rule. 1150 # 1151 # No current rule requires to handle this special case, so we 1152 # don't yet support it. 1153 assert variants is None or variants <= i_variants 1154 1155 # Combine all variants and remove duplicates. 1156 vars = set( 1157 i_variants.split("-") 1158 + (variants.split("-") if variants else []) 1159 ) 1160 1161 # Add the variants alphabetically sorted. 1162 n_type = (language, None, None, "-".join(sorted(vars))) 1163 1164 assert ( 1165 n_type not in transitive_rules 1166 or transitive_rules[n_type] == i_replacement 1167 ) 1168 transitive_rules[n_type] = i_replacement 1169 1170 continue 1171 1172 if i_script is not None and i_script == r_script: 1173 # This case currently doesn't occur, so we don't yet support it. 1174 raise ValueError( 1175 f"{type} -> {replacement} :: {i_type} -> {i_replacement}" 1176 ) 1177 if i_region is not None and i_region == r_region: 1178 # This case currently only applies for sign language 1179 # replacements. Similar to the language subtag case any other 1180 # combination isn't currently supported. 
1181 assert type == (None, None, Any, None) 1182 assert replacement == (None, None, Any, None) 1183 assert i_type == ("sgn", None, Any, None) 1184 assert i_replacement == (Any, None, None, None) 1185 1186 n_type = ("sgn", None, region, None) 1187 1188 assert n_type not in transitive_rules 1189 transitive_rules[n_type] = i_replacement 1190 1191 continue 1192 1193 if i_variants is not None and i_variants == r_variants: 1194 # This case currently doesn't occur, so we don't yet support it. 1195 raise ValueError( 1196 f"{type} -> {replacement} :: {i_type} -> {i_replacement}" 1197 ) 1198 1199 # Ensure there are no contradicting rules. 1200 assert all( 1201 rules[type] == replacement 1202 for (type, replacement) in transitive_rules.items() 1203 if type in rules 1204 ) 1205 1206 # If |transitive_rules| is not a subset of |rules|, new rules will be added. 1207 modified_rules = not (transitive_rules.keys() <= rules.keys()) 1208 1209 # Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}" 1210 # case. Failing this assertion means either there's a bug when computing the 1211 # stop condition of this loop or a new kind of legacy language tags was added. 1212 if modified_rules and loop_count > 1: 1213 new_rules = {k for k in transitive_rules.keys() if k not in rules} 1214 for k in new_rules: 1215 assert k == (Any, None, None, "guoyu-hakka") or k == ( 1216 Any, 1217 None, 1218 None, 1219 "guoyu-xiang", 1220 ) 1221 1222 # Merge the transitive rules. 1223 rules.update(transitive_rules) 1224 1225 # Computes the size of the union of all field value sets. 1226 def multi_map_size(locale_id): 1227 (language, script, region, variants) = locale_id 1228 1229 return ( 1230 (1 if language is not None else 0) 1231 + (1 if script is not None else 0) 1232 + (1 if region is not None else 0) 1233 + (len(variants.split("-")) if variants is not None else 0) 1234 ) 1235 1236 # Dictionary of legacy mappings, contains raw rules, e.g. 
1237 # (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97"). 1238 legacy_mappings = {} 1239 1240 # Dictionary of simple language subtag mappings, e.g. "in" -> "id". 1241 language_mappings = {} 1242 1243 # Dictionary of complex language subtag mappings, modifying more than one 1244 # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME"). 1245 complex_language_mappings = {} 1246 1247 # Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh". 1248 script_mappings = {} 1249 1250 # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE". 1251 region_mappings = {} 1252 1253 # Dictionary of complex region subtag mappings, containing more than one 1254 # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]). 1255 complex_region_mappings = {} 1256 1257 # Dictionary of aliased variant subtags to a tuple of preferred replacement 1258 # type and replacement, e.g. "arevela" -> ("language", "hy") or 1259 # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97"). 1260 variant_mappings = {} 1261 1262 # Preprocess all rules so we can perform a single lookup per subtag at runtime. 1263 for (type, replacement) in rules.items(): 1264 (language, script, region, variants) = type 1265 (r_language, r_script, r_region, r_variants) = replacement 1266 1267 type_map_size = multi_map_size(type) 1268 1269 # Most mappings are one-to-one and can be encoded through lookup tables. 1270 if type_map_size == 1: 1271 if language is not None: 1272 assert r_language is not None, "Can't remove a language subtag" 1273 1274 # We don't yet support this case. 1275 assert ( 1276 r_variants is None 1277 ), f"Unhandled variant replacement in language alias: {replacement}" 1278 1279 if replacement == (Any, None, None, None): 1280 language_mappings[language] = r_language 1281 else: 1282 complex_language_mappings[language] = replacement[:-1] 1283 elif script is not None: 1284 # We don't support removing script subtags. 
1285 assert ( 1286 r_script is not None 1287 ), f"Can't remove a script subtag: {replacement}" 1288 1289 # We only support one-to-one script mappings for now. 1290 assert replacement == ( 1291 None, 1292 Any, 1293 None, 1294 None, 1295 ), f"Unhandled replacement in script alias: {replacement}" 1296 1297 script_mappings[script] = r_script 1298 elif region is not None: 1299 # We don't support removing region subtags. 1300 assert ( 1301 r_region is not None 1302 ), f"Can't remove a region subtag: {replacement}" 1303 1304 # We only support one-to-one region mappings for now. 1305 assert replacement == ( 1306 None, 1307 None, 1308 Any, 1309 None, 1310 ), f"Unhandled replacement in region alias: {replacement}" 1311 1312 if type not in territory_exception_rules: 1313 region_mappings[region] = r_region 1314 else: 1315 complex_region_mappings[region] = [ 1316 r_region 1317 for (_, _, r_region, _) in territory_exception_rules[type] 1318 ] 1319 else: 1320 assert variants is not None 1321 assert len(variants.split("-")) == 1 1322 1323 # We only support one-to-one variant mappings for now. 1324 assert ( 1325 multi_map_size(replacement) <= 1 1326 ), f"Unhandled replacement in variant alias: {replacement}" 1327 1328 if r_language is not None: 1329 variant_mappings[variants] = ("language", r_language) 1330 elif r_script is not None: 1331 variant_mappings[variants] = ("script", r_script) 1332 elif r_region is not None: 1333 variant_mappings[variants] = ("region", r_region) 1334 elif r_variants is not None: 1335 assert len(r_variants.split("-")) == 1 1336 variant_mappings[variants] = ("variant", r_variants) 1337 else: 1338 variant_mappings[variants] = None 1339 else: 1340 # Alias rules which have multiple input fields must be processed 1341 # first. This applies only to a handful of rules, so our generated 1342 # code adds fast paths to skip these rules in the common case. 1343 1344 # Case 1: Language and at least one variant subtag. 
1345 if language is not None and variants is not None: 1346 pass 1347 1348 # Case 2: Sign language and a region subtag. 1349 elif language == "sgn" and region is not None: 1350 pass 1351 1352 # Case 3: "hepburn-heploc" to "alalc97" canonicalization. 1353 elif ( 1354 language is None 1355 and variants is not None 1356 and len(variants.split("-")) == 2 1357 ): 1358 pass 1359 1360 # Any other combination is currently unsupported. 1361 else: 1362 raise ValueError(f"{type} -> {replacement}") 1363 1364 legacy_mappings[type] = replacement 1365 1366 tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) 1367 1368 likely_subtags = {} 1369 1370 for likely_subtag in tree.iterfind(".//likelySubtag"): 1371 from_tag = bcp47_id(likely_subtag.get("from")) 1372 from_match = re_unicode_language_id.match(from_tag) 1373 assert ( 1374 from_match is not None 1375 ), f"{from_tag} invalid Unicode BCP 47 locale identifier" 1376 assert ( 1377 from_match.group("variants") is None 1378 ), f"unexpected variant subtags in {from_tag}" 1379 1380 to_tag = bcp47_id(likely_subtag.get("to")) 1381 to_match = re_unicode_language_id.match(to_tag) 1382 assert ( 1383 to_match is not None 1384 ), f"{to_tag} invalid Unicode BCP 47 locale identifier" 1385 assert ( 1386 to_match.group("variants") is None 1387 ), f"unexpected variant subtags in {to_tag}" 1388 1389 from_canonical = bcp47_canonical( 1390 *from_match.group("language", "script", "region", "variants") 1391 ) 1392 1393 to_canonical = bcp47_canonical( 1394 *to_match.group("language", "script", "region", "variants") 1395 ) 1396 1397 # Remove the empty variant subtags. 
1398 from_canonical = from_canonical[:-1] 1399 to_canonical = to_canonical[:-1] 1400 1401 likely_subtags[from_canonical] = to_canonical 1402 1403 complex_region_mappings_final = {} 1404 1405 for (deprecated_region, replacements) in complex_region_mappings.items(): 1406 # Find all likely subtag entries which don't already contain a region 1407 # subtag and whose target region is in the list of replacement regions. 1408 region_likely_subtags = [ 1409 (from_language, from_script, to_region) 1410 for ( 1411 (from_language, from_script, from_region), 1412 (_, _, to_region), 1413 ) in likely_subtags.items() 1414 if from_region is None and to_region in replacements 1415 ] 1416 1417 # The first replacement entry is the default region. 1418 default = replacements[0] 1419 1420 # Find all likely subtag entries whose region matches the default region. 1421 default_replacements = { 1422 (language, script) 1423 for (language, script, region) in region_likely_subtags 1424 if region == default 1425 } 1426 1427 # And finally find those entries which don't use the default region. 1428 # These are the entries we're actually interested in, because those need 1429 # to be handled specially when selecting the correct preferred region. 1430 non_default_replacements = [ 1431 (language, script, region) 1432 for (language, script, region) in region_likely_subtags 1433 if (language, script) not in default_replacements 1434 ] 1435 1436 # If there are no non-default replacements, we can handle the region as 1437 # part of the simple region mapping. 
1438 if non_default_replacements: 1439 complex_region_mappings_final[deprecated_region] = ( 1440 default, 1441 non_default_replacements, 1442 ) 1443 else: 1444 region_mappings[deprecated_region] = default 1445 1446 return { 1447 "legacyMappings": legacy_mappings, 1448 "languageMappings": language_mappings, 1449 "complexLanguageMappings": complex_language_mappings, 1450 "scriptMappings": script_mappings, 1451 "regionMappings": region_mappings, 1452 "complexRegionMappings": complex_region_mappings_final, 1453 "variantMappings": variant_mappings, 1454 "likelySubtags": likely_subtags, 1455 } 1456 1457 1458def readUnicodeExtensions(core_file): 1459 import xml.etree.ElementTree as ET 1460 1461 # Match all xml-files in the BCP 47 directory. 1462 bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$") 1463 1464 # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier 1465 # 1466 # type = alphanum{3,8} (sep alphanum{3,8})* ; 1467 typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$") 1468 1469 # Mapping from Unicode extension types to dict of deprecated to 1470 # preferred values. 1471 mapping = { 1472 # Unicode BCP 47 U Extension 1473 "u": {}, 1474 # Unicode BCP 47 T Extension 1475 "t": {}, 1476 } 1477 1478 def readBCP47File(file): 1479 tree = ET.parse(file) 1480 for keyword in tree.iterfind(".//keyword/key"): 1481 extension = keyword.get("extension", "u") 1482 assert ( 1483 extension == "u" or extension == "t" 1484 ), "unknown extension type: {}".format(extension) 1485 1486 extension_name = keyword.get("name") 1487 1488 for type in keyword.iterfind("type"): 1489 # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: 1490 # 1491 # The key or type name used by Unicode locale extension with 'u' extension 1492 # syntax or the 't' extensions syntax. When alias below is absent, this name 1493 # can be also used with the old style "@key=type" syntax. 
1494 name = type.get("name") 1495 1496 # Ignore the special name: 1497 # - <https://unicode.org/reports/tr35/#CODEPOINTS> 1498 # - <https://unicode.org/reports/tr35/#REORDER_CODE> 1499 # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE> 1500 # - <https://unicode.org/reports/tr35/#SCRIPT_CODE> 1501 # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE> 1502 # - <https://unicode.org/reports/tr35/#PRIVATE_USE> 1503 if name in ( 1504 "CODEPOINTS", 1505 "REORDER_CODE", 1506 "RG_KEY_VALUE", 1507 "SCRIPT_CODE", 1508 "SUBDIVISION_CODE", 1509 "PRIVATE_USE", 1510 ): 1511 continue 1512 1513 # All other names should match the 'type' production. 1514 assert ( 1515 typeRE.match(name) is not None 1516 ), "{} matches the 'type' production".format(name) 1517 1518 # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: 1519 # 1520 # The preferred value of the deprecated key, type or attribute element. 1521 # When a key, type or attribute element is deprecated, this attribute is 1522 # used for specifying a new canonical form if available. 1523 preferred = type.get("preferred") 1524 1525 # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: 1526 # 1527 # The BCP 47 form is the canonical form, and recommended. Other aliases are 1528 # included only for backwards compatibility. 1529 alias = type.get("alias") 1530 1531 # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> 1532 # 1533 # Use the bcp47 data to replace keys, types, tfields, and tvalues by their 1534 # canonical forms. See Section 3.6.4 U Extension Data Files) and Section 1535 # 3.7.1 T Extension Data Files. The aliases are in the alias attribute 1536 # value, while the canonical is in the name attribute value. 1537 1538 # 'preferred' contains the new preferred name, 'alias' the compatibility 1539 # name, but then there's this entry where 'preferred' and 'alias' are the 1540 # same. So which one to choose? Assume 'preferred' is the actual canonical 1541 # name. 
1542 # 1543 # <type name="islamicc" 1544 # description="Civil (algorithmic) Arabic calendar" 1545 # deprecated="true" 1546 # preferred="islamic-civil" 1547 # alias="islamic-civil"/> 1548 1549 if preferred is not None: 1550 assert typeRE.match(preferred), preferred 1551 mapping[extension].setdefault(extension_name, {})[name] = preferred 1552 1553 if alias is not None: 1554 for alias_name in alias.lower().split(" "): 1555 # Ignore alias entries which don't match the 'type' production. 1556 if typeRE.match(alias_name) is None: 1557 continue 1558 1559 # See comment above when 'alias' and 'preferred' are both present. 1560 if ( 1561 preferred is not None 1562 and name in mapping[extension][extension_name] 1563 ): 1564 continue 1565 1566 # Skip over entries where 'name' and 'alias' are equal. 1567 # 1568 # <type name="pst8pdt" 1569 # description="POSIX style time zone for US Pacific Time" 1570 # alias="PST8PDT" 1571 # since="1.8"/> 1572 if name == alias_name: 1573 continue 1574 1575 mapping[extension].setdefault(extension_name, {})[ 1576 alias_name 1577 ] = name 1578 1579 def readSupplementalMetadata(file): 1580 # Find subdivision and region replacements. 1581 # 1582 # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> 1583 # 1584 # Replace aliases in special key values: 1585 # - If there is an 'sd' or 'rg' key, replace any subdivision alias 1586 # in its value in the same way, using subdivisionAlias data. 1587 tree = ET.parse(file) 1588 for alias in tree.iterfind(".//subdivisionAlias"): 1589 type = alias.get("type") 1590 assert ( 1591 typeRE.match(type) is not None 1592 ), "{} matches the 'type' production".format(type) 1593 1594 # Take the first replacement when multiple ones are present. 1595 replacement = alias.get("replacement").split(" ")[0].lower() 1596 1597 # Skip over invalid replacements. 
1598 # 1599 # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/> 1600 # 1601 # It's not entirely clear to me if CLDR actually wants to use 1602 # "axzzzz" as the replacement for this case. 1603 if typeRE.match(replacement) is None: 1604 continue 1605 1606 # 'subdivisionAlias' applies to 'rg' and 'sd' keys. 1607 mapping["u"].setdefault("rg", {})[type] = replacement 1608 mapping["u"].setdefault("sd", {})[type] = replacement 1609 1610 for name in core_file.namelist(): 1611 if bcpFileRE.match(name): 1612 readBCP47File(core_file.open(name)) 1613 1614 readSupplementalMetadata( 1615 core_file.open("common/supplemental/supplementalMetadata.xml") 1616 ) 1617 1618 return { 1619 "unicodeMappings": mapping["u"], 1620 "transformMappings": mapping["t"], 1621 } 1622 1623 1624def writeCLDRLanguageTagData(println, data, url): 1625 """ Writes the language tag data to the Intl data file. """ 1626 1627 println(generatedFileWarning) 1628 println("// Version: CLDR-{}".format(data["version"])) 1629 println("// URL: {}".format(url)) 1630 1631 println( 1632 """ 1633#include "mozilla/Assertions.h" 1634#include "mozilla/Span.h" 1635#include "mozilla/TextUtils.h" 1636 1637#include <algorithm> 1638#include <cstdint> 1639#include <cstring> 1640#include <iterator> 1641#include <string> 1642#include <type_traits> 1643 1644#include "builtin/intl/LanguageTag.h" 1645#include "util/Text.h" 1646#include "vm/JSContext.h" 1647 1648using namespace js::intl::LanguageTagLimits; 1649 1650template <size_t Length, size_t TagLength, size_t SubtagLength> 1651static inline bool HasReplacement( 1652 const char (&subtags)[Length][TagLength], 1653 const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { 1654 MOZ_ASSERT(subtag.length() == TagLength - 1, 1655 "subtag must have the same length as the list of subtags"); 1656 1657 const char* ptr = subtag.span().data(); 1658 return std::binary_search(std::begin(subtags), std::end(subtags), ptr, 1659 [](const char* a, const char* b) { 1660 return 
memcmp(a, b, TagLength - 1) < 0; 1661 }); 1662} 1663 1664template <size_t Length, size_t TagLength, size_t SubtagLength> 1665static inline const char* SearchReplacement( 1666 const char (&subtags)[Length][TagLength], 1667 const char* (&aliases)[Length], 1668 const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { 1669 MOZ_ASSERT(subtag.length() == TagLength - 1, 1670 "subtag must have the same length as the list of subtags"); 1671 1672 const char* ptr = subtag.span().data(); 1673 auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, 1674 [](const char* a, const char* b) { 1675 return memcmp(a, b, TagLength - 1) < 0; 1676 }); 1677 if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { 1678 return aliases[std::distance(std::begin(subtags), p)]; 1679 } 1680 return nullptr; 1681} 1682 1683#ifdef DEBUG 1684static bool IsAsciiLowercaseAlphanumeric(char c) { 1685 return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); 1686} 1687 1688static bool IsAsciiLowercaseAlphanumericOrDash(char c) { 1689 return IsAsciiLowercaseAlphanumeric(c) || c == '-'; 1690} 1691 1692static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { 1693 // Tell the analysis the |std::all_of| function can't GC. 1694 JS::AutoSuppressGCAnalysis nogc; 1695 1696 return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>); 1697} 1698 1699static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) { 1700 // Tell the analysis the |std::all_of| function can't GC. 1701 JS::AutoSuppressGCAnalysis nogc; 1702 1703 return mozilla::IsAsciiUppercaseAlpha(span[0]) && 1704 std::all_of(span.begin() + 1, span.end(), mozilla::IsAsciiLowercaseAlpha<char>); 1705} 1706 1707static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { 1708 // Tell the analysis the |std::all_of| function can't GC. 
1709 JS::AutoSuppressGCAnalysis nogc; 1710 1711 return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) || 1712 std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>); 1713} 1714 1715static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { 1716 // Tell the analysis the |std::all_of| function can't GC. 1717 JS::AutoSuppressGCAnalysis nogc; 1718 1719 return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); 1720} 1721 1722static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) { 1723 return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); 1724} 1725 1726static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) { 1727 return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); 1728} 1729 1730static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) { 1731 return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); 1732} 1733 1734static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) { 1735 return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); 1736} 1737#endif 1738""".rstrip() 1739 ) 1740 1741 source = "CLDR Supplemental Data, version {}".format(data["version"]) 1742 legacy_mappings = data["legacyMappings"] 1743 language_mappings = data["languageMappings"] 1744 complex_language_mappings = data["complexLanguageMappings"] 1745 script_mappings = data["scriptMappings"] 1746 region_mappings = data["regionMappings"] 1747 complex_region_mappings = data["complexRegionMappings"] 1748 variant_mappings = data["variantMappings"] 1749 unicode_mappings = data["unicodeMappings"] 1750 transform_mappings = data["transformMappings"] 1751 1752 # unicode_language_subtag = alpha{2,3} | alpha{5,8} ; 1753 language_maxlength = 8 1754 1755 # unicode_script_subtag = alpha{4} ; 1756 script_maxlength = 4 1757 1758 # unicode_region_subtag = (alpha{2} | digit{3}) ; 1759 
region_maxlength = 3 1760 1761 writeMappingsBinarySearch( 1762 println, 1763 "languageMapping", 1764 "LanguageSubtag&", 1765 "language", 1766 "IsStructurallyValidLanguageTag", 1767 "IsCanonicallyCasedLanguageTag", 1768 language_mappings, 1769 language_maxlength, 1770 "Mappings from language subtags to preferred values.", 1771 source, 1772 url, 1773 ) 1774 writeMappingsBinarySearch( 1775 println, 1776 "complexLanguageMapping", 1777 "const LanguageSubtag&", 1778 "language", 1779 "IsStructurallyValidLanguageTag", 1780 "IsCanonicallyCasedLanguageTag", 1781 complex_language_mappings.keys(), 1782 language_maxlength, 1783 "Language subtags with complex mappings.", 1784 source, 1785 url, 1786 ) 1787 writeMappingsBinarySearch( 1788 println, 1789 "scriptMapping", 1790 "ScriptSubtag&", 1791 "script", 1792 "IsStructurallyValidScriptTag", 1793 "IsCanonicallyCasedScriptTag", 1794 script_mappings, 1795 script_maxlength, 1796 "Mappings from script subtags to preferred values.", 1797 source, 1798 url, 1799 ) 1800 writeMappingsBinarySearch( 1801 println, 1802 "regionMapping", 1803 "RegionSubtag&", 1804 "region", 1805 "IsStructurallyValidRegionTag", 1806 "IsCanonicallyCasedRegionTag", 1807 region_mappings, 1808 region_maxlength, 1809 "Mappings from region subtags to preferred values.", 1810 source, 1811 url, 1812 ) 1813 writeMappingsBinarySearch( 1814 println, 1815 "complexRegionMapping", 1816 "const RegionSubtag&", 1817 "region", 1818 "IsStructurallyValidRegionTag", 1819 "IsCanonicallyCasedRegionTag", 1820 complex_region_mappings.keys(), 1821 region_maxlength, 1822 "Region subtags with complex mappings.", 1823 source, 1824 url, 1825 ) 1826 1827 writeComplexLanguageTagMappings( 1828 println, 1829 complex_language_mappings, 1830 "Language subtags with complex mappings.", 1831 source, 1832 url, 1833 ) 1834 writeComplexRegionTagMappings( 1835 println, 1836 complex_region_mappings, 1837 "Region subtags with complex mappings.", 1838 source, 1839 url, 1840 ) 1841 1842 
writeVariantTagMappings( 1843 println, 1844 variant_mappings, 1845 "Mappings from variant subtags to preferred values.", 1846 source, 1847 url, 1848 ) 1849 1850 writeLegacyMappingsFunction( 1851 println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url 1852 ) 1853 1854 writeSignLanguageMappingsFunction( 1855 println, legacy_mappings, "Mappings from legacy sign languages.", source, url 1856 ) 1857 1858 writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode") 1859 writeUnicodeExtensionsMappings(println, transform_mappings, "Transform") 1860 1861 1862def writeCLDRLanguageTagLikelySubtagsTest(println, data, url): 1863 """ Writes the likely-subtags test file. """ 1864 1865 println(generatedFileWarning) 1866 1867 source = "CLDR Supplemental Data, version {}".format(data["version"]) 1868 language_mappings = data["languageMappings"] 1869 complex_language_mappings = data["complexLanguageMappings"] 1870 script_mappings = data["scriptMappings"] 1871 region_mappings = data["regionMappings"] 1872 complex_region_mappings = data["complexRegionMappings"] 1873 likely_subtags = data["likelySubtags"] 1874 1875 def bcp47(tag): 1876 (language, script, region) = tag 1877 return "{}{}{}".format( 1878 language, "-" + script if script else "", "-" + region if region else "" 1879 ) 1880 1881 def canonical(tag): 1882 (language, script, region) = tag 1883 1884 # Map deprecated language subtags. 1885 if language in language_mappings: 1886 language = language_mappings[language] 1887 elif language in complex_language_mappings: 1888 (language2, script2, region2) = complex_language_mappings[language] 1889 (language, script, region) = ( 1890 language2, 1891 script if script else script2, 1892 region if region else region2, 1893 ) 1894 1895 # Map deprecated script subtags. 1896 if script in script_mappings: 1897 script = script_mappings[script] 1898 1899 # Map deprecated region subtags. 
1900 if region in region_mappings: 1901 region = region_mappings[region] 1902 else: 1903 # Assume no complex region mappings are needed for now. 1904 assert ( 1905 region not in complex_region_mappings 1906 ), "unexpected region with complex mappings: {}".format(region) 1907 1908 return (language, script, region) 1909 1910 # https://unicode.org/reports/tr35/#Likely_Subtags 1911 1912 def addLikelySubtags(tag): 1913 # Step 1: Canonicalize. 1914 (language, script, region) = canonical(tag) 1915 if script == "Zzzz": 1916 script = None 1917 if region == "ZZ": 1918 region = None 1919 1920 # Step 2: Lookup. 1921 searches = ( 1922 (language, script, region), 1923 (language, None, region), 1924 (language, script, None), 1925 (language, None, None), 1926 ("und", script, None), 1927 ) 1928 search = next(search for search in searches if search in likely_subtags) 1929 1930 (language_s, script_s, region_s) = search 1931 (language_m, script_m, region_m) = likely_subtags[search] 1932 1933 # Step 3: Return. 1934 return ( 1935 language if language != language_s else language_m, 1936 script if script != script_s else script_m, 1937 region if region != region_s else region_m, 1938 ) 1939 1940 # https://unicode.org/reports/tr35/#Likely_Subtags 1941 def removeLikelySubtags(tag): 1942 # Step 1: Add likely subtags. 1943 max = addLikelySubtags(tag) 1944 1945 # Step 2: Remove variants (doesn't apply here). 1946 1947 # Step 3: Find a match. 1948 (language, script, region) = max 1949 for trial in ( 1950 (language, None, None), 1951 (language, None, region), 1952 (language, script, None), 1953 ): 1954 if addLikelySubtags(trial) == max: 1955 return trial 1956 1957 # Step 4: Return maximized if no match found. 1958 return max 1959 1960 def likely_canonical(from_tag, to_tag): 1961 # Canonicalize the input tag. 1962 from_tag = canonical(from_tag) 1963 1964 # Update the expected result if necessary. 
1965 if from_tag in likely_subtags: 1966 to_tag = likely_subtags[from_tag] 1967 1968 # Canonicalize the expected output. 1969 to_canonical = canonical(to_tag) 1970 1971 # Sanity check: This should match the result of |addLikelySubtags|. 1972 assert to_canonical == addLikelySubtags(from_tag) 1973 1974 return to_canonical 1975 1976 # |likely_subtags| contains non-canonicalized tags, so canonicalize it first. 1977 likely_subtags_canonical = { 1978 k: likely_canonical(k, v) for (k, v) in likely_subtags.items() 1979 } 1980 1981 # Add test data for |Intl.Locale.prototype.maximize()|. 1982 writeMappingsVar( 1983 println, 1984 {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()}, 1985 "maxLikelySubtags", 1986 "Extracted from likelySubtags.xml.", 1987 source, 1988 url, 1989 ) 1990 1991 # Use the maximalized tags as the input for the remove likely-subtags test. 1992 minimized = { 1993 tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values() 1994 } 1995 1996 # Add test data for |Intl.Locale.prototype.minimize()|. 
1997 writeMappingsVar( 1998 println, 1999 {bcp47(k): bcp47(v) for (k, v) in minimized.items()}, 2000 "minLikelySubtags", 2001 "Extracted from likelySubtags.xml.", 2002 source, 2003 url, 2004 ) 2005 2006 println( 2007 """ 2008for (let [tag, maximal] of Object.entries(maxLikelySubtags)) { 2009 assertEq(new Intl.Locale(tag).maximize().toString(), maximal); 2010}""" 2011 ) 2012 2013 println( 2014 """ 2015for (let [tag, minimal] of Object.entries(minLikelySubtags)) { 2016 assertEq(new Intl.Locale(tag).minimize().toString(), minimal); 2017}""" 2018 ) 2019 2020 println( 2021 """ 2022if (typeof reportCompare === "function") 2023 reportCompare(0, 0);""" 2024 ) 2025 2026 2027def readCLDRVersionFromICU(): 2028 icuDir = os.path.join(topsrcdir, "intl/icu/source") 2029 if not os.path.isdir(icuDir): 2030 raise RuntimeError("not a directory: {}".format(icuDir)) 2031 2032 reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}') 2033 2034 for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")): 2035 m = reVersion.match(line) 2036 if m: 2037 version = m.group(1) 2038 break 2039 2040 if version is None: 2041 raise RuntimeError("can't resolve CLDR version") 2042 2043 return version 2044 2045 2046def updateCLDRLangTags(args): 2047 """ Update the LanguageTagGenerated.cpp file. """ 2048 version = args.version 2049 url = args.url 2050 out = args.out 2051 filename = args.file 2052 2053 # Determine current CLDR version from ICU. 
2054 if version is None: 2055 version = readCLDRVersionFromICU() 2056 2057 url = url.replace("<VERSION>", version) 2058 2059 print("Arguments:") 2060 print("\tCLDR version: %s" % version) 2061 print("\tDownload url: %s" % url) 2062 if filename is not None: 2063 print("\tLocal CLDR core.zip file: %s" % filename) 2064 print("\tOutput file: %s" % out) 2065 print("") 2066 2067 data = { 2068 "version": version, 2069 } 2070 2071 def readFiles(cldr_file): 2072 with ZipFile(cldr_file) as zip_file: 2073 data.update(readSupplementalData(zip_file)) 2074 data.update(readUnicodeExtensions(zip_file)) 2075 2076 print("Processing CLDR data...") 2077 if filename is not None: 2078 print("Always make sure you have the newest CLDR core.zip!") 2079 with open(filename, "rb") as cldr_file: 2080 readFiles(cldr_file) 2081 else: 2082 print("Downloading CLDR core.zip...") 2083 with closing(urlopen(url)) as cldr_file: 2084 cldr_data = io.BytesIO(cldr_file.read()) 2085 readFiles(cldr_data) 2086 2087 print("Writing Intl data...") 2088 with io.open(out, mode="w", encoding="utf-8", newline="") as f: 2089 println = partial(print, file=f) 2090 2091 writeCLDRLanguageTagData(println, data, url) 2092 2093 print("Writing Intl test data...") 2094 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 2095 test_file = os.path.join( 2096 js_src_builtin_intl_dir, 2097 "../../tests/non262/Intl/Locale/likely-subtags-generated.js", 2098 ) 2099 with io.open(test_file, mode="w", encoding="utf-8", newline="") as f: 2100 println = partial(print, file=f) 2101 2102 println("// |reftest| skip-if(!this.hasOwnProperty('Intl'))") 2103 writeCLDRLanguageTagLikelySubtagsTest(println, data, url) 2104 2105 2106def flines(filepath, encoding="utf-8"): 2107 """ Open filepath and iterate over its content. """ 2108 with io.open(filepath, mode="r", encoding=encoding) as f: 2109 for line in f: 2110 yield line 2111 2112 2113@total_ordering 2114class Zone(object): 2115 """ Time zone with optional file name. 
""" 2116 2117 def __init__(self, name, filename=""): 2118 self.name = name 2119 self.filename = filename 2120 2121 def __eq__(self, other): 2122 return hasattr(other, "name") and self.name == other.name 2123 2124 def __lt__(self, other): 2125 return self.name < other.name 2126 2127 def __hash__(self): 2128 return hash(self.name) 2129 2130 def __str__(self): 2131 return self.name 2132 2133 def __repr__(self): 2134 return self.name 2135 2136 2137class TzDataDir(object): 2138 """ tzdata source from a directory. """ 2139 2140 def __init__(self, obj): 2141 self.name = partial(os.path.basename, obj) 2142 self.resolve = partial(os.path.join, obj) 2143 self.basename = os.path.basename 2144 self.isfile = os.path.isfile 2145 self.listdir = partial(os.listdir, obj) 2146 self.readlines = flines 2147 2148 2149class TzDataFile(object): 2150 """ tzdata source from a file (tar or gzipped). """ 2151 2152 def __init__(self, obj): 2153 self.name = lambda: os.path.splitext( 2154 os.path.splitext(os.path.basename(obj))[0] 2155 )[0] 2156 self.resolve = obj.getmember 2157 self.basename = attrgetter("name") 2158 self.isfile = tarfile.TarInfo.isfile 2159 self.listdir = obj.getnames 2160 self.readlines = partial(self._tarlines, obj) 2161 2162 def _tarlines(self, tar, m): 2163 with closing(tar.extractfile(m)) as f: 2164 for line in f: 2165 yield line.decode("utf-8") 2166 2167 2168def validateTimeZones(zones, links): 2169 """ Validate the zone and link entries. 
""" 2170 linkZones = set(links.keys()) 2171 intersect = linkZones.intersection(zones) 2172 if intersect: 2173 raise RuntimeError("Links also present in zones: %s" % intersect) 2174 2175 zoneNames = {z.name for z in zones} 2176 linkTargets = set(links.values()) 2177 if not linkTargets.issubset(zoneNames): 2178 raise RuntimeError( 2179 "Link targets not found: %s" % linkTargets.difference(zoneNames) 2180 ) 2181 2182 2183def partition(iterable, *predicates): 2184 def innerPartition(pred, it): 2185 it1, it2 = tee(it) 2186 return (filter(pred, it1), filterfalse(pred, it2)) 2187 2188 if len(predicates) == 0: 2189 return iterable 2190 (left, right) = innerPartition(predicates[0], iterable) 2191 if len(predicates) == 1: 2192 return (left, right) 2193 return tuple([left] + list(partition(right, *predicates[1:]))) 2194 2195 2196def listIANAFiles(tzdataDir): 2197 def isTzFile(d, m, f): 2198 return m(f) and d.isfile(d.resolve(f)) 2199 2200 return filter( 2201 partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match), 2202 tzdataDir.listdir(), 2203 ) 2204 2205 2206def readIANAFiles(tzdataDir, files): 2207 """ Read all IANA time zone files from the given iterable. """ 2208 nameSyntax = "[\w/+\-]+" 2209 pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax) 2210 pLink = re.compile( 2211 r"Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?" 
% (nameSyntax, nameSyntax) 2212 ) 2213 2214 def createZone(line, fname): 2215 match = pZone.match(line) 2216 name = match.group("name") 2217 return Zone(name, fname) 2218 2219 def createLink(line, fname): 2220 match = pLink.match(line) 2221 (name, target) = match.group("name", "target") 2222 return (Zone(name, fname), target) 2223 2224 zones = set() 2225 links = dict() 2226 for filename in files: 2227 filepath = tzdataDir.resolve(filename) 2228 for line in tzdataDir.readlines(filepath): 2229 if line.startswith("Zone"): 2230 zones.add(createZone(line, filename)) 2231 if line.startswith("Link"): 2232 (link, target) = createLink(line, filename) 2233 links[link] = target 2234 2235 return (zones, links) 2236 2237 2238def readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory): 2239 """ Read the IANA time zone information from `tzdataDir`. """ 2240 2241 backzoneFiles = {"backzone"} 2242 (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__) 2243 2244 # Read zone and link infos. 2245 (zones, links) = readIANAFiles(tzdataDir, tzfiles) 2246 (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles) 2247 2248 # Remove the placeholder time zone "Factory". 2249 if ignoreFactory: 2250 zones.remove(Zone("Factory")) 2251 2252 # Merge with backzone data. 2253 if not ignoreBackzone: 2254 zones |= backzones 2255 links = { 2256 name: target for name, target in links.items() if name not in backzones 2257 } 2258 links.update(backlinks) 2259 2260 validateTimeZones(zones, links) 2261 2262 return (zones, links) 2263 2264 2265def readICUResourceFile(filename): 2266 """Read an ICU resource file. 2267 2268 Yields (<table-name>, <startOrEnd>, <value>) for each table. 
2269 """ 2270 2271 numberValue = r"-?\d+" 2272 stringValue = r'".+?"' 2273 2274 def asVector(val): 2275 return r"%s(?:\s*,\s*%s)*" % (val, val) 2276 2277 numberVector = asVector(numberValue) 2278 stringVector = asVector(stringValue) 2279 2280 reNumberVector = re.compile(numberVector) 2281 reStringVector = re.compile(stringVector) 2282 reNumberValue = re.compile(numberValue) 2283 reStringValue = re.compile(stringValue) 2284 2285 def parseValue(value): 2286 m = reNumberVector.match(value) 2287 if m: 2288 return [int(v) for v in reNumberValue.findall(value)] 2289 m = reStringVector.match(value) 2290 if m: 2291 return [v[1:-1] for v in reStringValue.findall(value)] 2292 raise RuntimeError("unknown value type: %s" % value) 2293 2294 def extractValue(values): 2295 if len(values) == 0: 2296 return None 2297 if len(values) == 1: 2298 return values[0] 2299 return values 2300 2301 def line(*args): 2302 maybeMultiComments = r"(?:/\*[^*]*\*/)*" 2303 maybeSingleComment = r"(?://.*)?" 2304 lineStart = "^%s" % maybeMultiComments 2305 lineEnd = "%s\s*%s$" % (maybeMultiComments, maybeSingleComment) 2306 return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd]))) 2307 2308 tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)' 2309 tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector) 2310 2311 reStartTable = line(tableName, r"\{") 2312 reEndTable = line(r"\}") 2313 reSingleValue = line(r",?", tableValue, r",?") 2314 reCompactTable = line(tableName, r"\{", tableValue, r"\}") 2315 reEmptyLine = line() 2316 2317 tables = [] 2318 2319 def currentTable(): 2320 return "|".join(tables) 2321 2322 values = [] 2323 for line in flines(filename, "utf-8-sig"): 2324 line = line.strip() 2325 if line == "": 2326 continue 2327 2328 m = reEmptyLine.match(line) 2329 if m: 2330 continue 2331 2332 m = reStartTable.match(line) 2333 if m: 2334 assert len(values) == 0 2335 tables.append(m.group("name")) 2336 continue 2337 2338 m = reEndTable.match(line) 2339 if m: 2340 yield 
(currentTable(), extractValue(values)) 2341 tables.pop() 2342 values = [] 2343 continue 2344 2345 m = reCompactTable.match(line) 2346 if m: 2347 assert len(values) == 0 2348 tables.append(m.group("name")) 2349 yield (currentTable(), extractValue(parseValue(m.group("value")))) 2350 tables.pop() 2351 continue 2352 2353 m = reSingleValue.match(line) 2354 if m and tables: 2355 values.extend(parseValue(m.group("value"))) 2356 continue 2357 2358 raise RuntimeError("unknown entry: %s" % line) 2359 2360 2361def readICUTimeZonesFromTimezoneTypes(icuTzDir): 2362 """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt 2363 and returns the tuple (zones, links). 2364 """ 2365 typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|" 2366 typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|" 2367 2368 def toTimeZone(name): 2369 return Zone(name.replace(":", "/")) 2370 2371 zones = set() 2372 links = dict() 2373 2374 for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")): 2375 if name.startswith(typeMapTimeZoneKey): 2376 zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :])) 2377 if name.startswith(typeAliasTimeZoneKey): 2378 links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value 2379 2380 validateTimeZones(zones, links) 2381 2382 return (zones, links) 2383 2384 2385def readICUTimeZonesFromZoneInfo(icuTzDir): 2386 """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt 2387 and returns the tuple (zones, links). 
    """
    zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
    linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
    namesKey = "zoneinfo64:table(nofallback)|Names"

    # Table entries are zones, int entries are links to the zone at the stored
    # index; both kinds advance the running zone id.
    tzId = 0
    tzLinks = dict()
    tzNames = []

    for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
        if name == zoneKey:
            tzId += 1
        elif name == linkKey:
            tzLinks[tzId] = int(value)
            tzId += 1
        elif name == namesKey:
            tzNames.extend(value)

    # Resolve the numeric link indices to their zone names.
    links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
    zones = {Zone(v) for v in tzNames if Zone(v) not in links}

    validateTimeZones(zones, links)

    return (zones, links)


def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
    # zoneinfo64.txt contains the supported time zones by ICU. This data is
    # generated from tzdata files, it doesn't include "backzone" in stock ICU.
    (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir)

    # timezoneTypes.txt contains the canonicalization information for ICU. This
    # data is generated from CLDR files. It includes data about time zones from
    # tzdata's "backzone" file.
    (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)

    # Remove the placeholder time zone "Factory".
    # See also <https://github.com/eggert/tz/blob/master/factory>.
    if ignoreFactory:
        zoneinfoZones.remove(Zone("Factory"))

    # Remove the ICU placeholder time zone "Etc/Unknown".
    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
    for zones in (zoneinfoZones, typesZones):
        zones.remove(Zone("Etc/Unknown"))

    # Remove any outdated ICU links.
    for links in (zoneinfoLinks, typesLinks):
        for zone in otherICULegacyLinks().keys():
            # A stale entry in otherICULegacyLinks() is an error: fail loudly
            # instead of silently ignoring it.
            if zone not in links:
                raise KeyError(f"Can't remove non-existent link from '{zone}'")
            del links[zone]

    # Information in zoneinfo64 should be a superset of timezoneTypes.
    def inZoneInfo64(zone):
        return zone in zoneinfoZones or zone in zoneinfoLinks

    notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
    if notFoundInZoneInfo64:
        raise RuntimeError(
            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
        )

    notFoundInZoneInfo64 = [
        zone for zone in typesLinks.keys() if not inZoneInfo64(zone)
    ]
    if notFoundInZoneInfo64:
        raise RuntimeError(
            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
        )

    # zoneinfo64.txt only defines the supported time zones by ICU, the canonicalization
    # rules are defined through timezoneTypes.txt. Merge both to get the actual zones
    # and links used by ICU.
    icuZones = set(
        chain(
            (zone for zone in zoneinfoZones if zone not in typesLinks),
            (zone for zone in typesZones),
        )
    )
    icuLinks = dict(
        chain(
            (
                (zone, target)
                for (zone, target) in zoneinfoLinks.items()
                if zone not in typesZones
            ),
            ((zone, target) for (zone, target) in typesLinks.items()),
        )
    )

    return (icuZones, icuLinks)


def readICULegacyZones(icuDir):
    """Read the ICU legacy time zones from `icuTzDir`/tools/tzcode/icuzones
    and returns the tuple (zones, links).
    """
    tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))

    # Per spec we must recognize only IANA time zones and links, but ICU
    # recognizes various legacy, non-IANA time zones and links. Compute these
    # non-IANA time zones and links.

    # Most legacy, non-IANA time zones and links are in the icuzones file.
2493 (zones, links) = readIANAFiles(tzdir, ["icuzones"]) 2494 2495 # Remove the ICU placeholder time zone "Etc/Unknown". 2496 # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>. 2497 zones.remove(Zone("Etc/Unknown")) 2498 2499 # A handful of non-IANA zones/links are not in icuzones and must be added 2500 # manually so that we won't invoke ICU with them. 2501 for (zone, target) in otherICULegacyLinks().items(): 2502 if zone in links: 2503 if links[zone] != target: 2504 raise KeyError( 2505 f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'" 2506 ) 2507 else: 2508 print( 2509 f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()" 2510 ) 2511 links[zone] = target 2512 2513 return (zones, links) 2514 2515 2516def otherICULegacyLinks(): 2517 """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time 2518 zones with the exception of time zones which are removed by IANA after an 2519 ICU release. 2520 2521 For example ICU 67 uses tzdata2018i, but tzdata2020b removed the link from 2522 "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates 2523 don't include modified icuzones files, so we must manually record any IANA 2524 modifications here. 2525 2526 After an ICU update, we can remove any no longer needed entries from this 2527 function by checking if the relevant entries are now included in icuzones. 2528 """ 2529 2530 return { 2531 # Current ICU is up-to-date with IANA, so this dict is empty. 2532 } 2533 2534 2535def icuTzDataVersion(icuTzDir): 2536 """ Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt. 
""" 2537 2538 def searchInFile(pattern, f): 2539 p = re.compile(pattern) 2540 for line in flines(f, "utf-8-sig"): 2541 m = p.search(line) 2542 if m: 2543 return m.group(1) 2544 return None 2545 2546 zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt") 2547 if not os.path.isfile(zoneinfo): 2548 raise RuntimeError("file not found: %s" % zoneinfo) 2549 version = searchInFile("^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo) 2550 if version is None: 2551 raise RuntimeError( 2552 "%s does not contain a valid tzdata version string" % zoneinfo 2553 ) 2554 return version 2555 2556 2557def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone): 2558 """ Find incorrect ICU zone entries. """ 2559 2560 def isIANATimeZone(zone): 2561 return zone in ianaZones or zone in ianaLinks 2562 2563 def isICUTimeZone(zone): 2564 return zone in icuZones or zone in icuLinks 2565 2566 def isICULink(zone): 2567 return zone in icuLinks 2568 2569 # All IANA zones should be present in ICU. 2570 missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)] 2571 # Normally zones in backzone are also present as links in one of the other 2572 # time zone files. The only exception to this rule is the Asia/Hanoi time 2573 # zone, this zone is only present in the backzone file. 2574 expectedMissing = [] if ignoreBackzone else [Zone("Asia/Hanoi")] 2575 if missingTimeZones != expectedMissing: 2576 raise RuntimeError( 2577 "Not all zones are present in ICU, did you forget " 2578 "to run intl/update-tzdata.sh? %s" % missingTimeZones 2579 ) 2580 2581 # Zones which are only present in ICU? 2582 additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)] 2583 if additionalTimeZones: 2584 raise RuntimeError( 2585 "Additional zones present in ICU, did you forget " 2586 "to run intl/update-tzdata.sh? %s" % additionalTimeZones 2587 ) 2588 2589 # Zones which are marked as links in ICU. 
    result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))

    # Remove unnecessary UTC mappings.
    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
    result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)

    return sorted(result, key=itemgetter(0))


def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
    """ Find incorrect ICU link entries. """

    def isIANATimeZone(zone):
        return zone in ianaZones or zone in ianaLinks

    def isICUTimeZone(zone):
        return zone in icuZones or zone in icuLinks

    def isICULink(zone):
        return zone in icuLinks

    def isICUZone(zone):
        return zone in icuZones

    # All links should be present in ICU.
    missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
    if missingTimeZones:
        raise RuntimeError(
            "Not all zones are present in ICU, did you forget "
            "to run intl/update-tzdata.sh? %s" % missingTimeZones
        )

    # Links which are only present in ICU?
    additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
    if additionalTimeZones:
        raise RuntimeError(
            "Additional links present in ICU, did you forget "
            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
        )

    result = chain(
        # IANA links which have a different target in ICU.
        (
            (zone, target, icuLinks[zone])
            for (zone, target) in ianaLinks.items()
            if isICULink(zone) and target != icuLinks[zone]
        ),
        # IANA links which are zones in ICU.
        (
            (zone, target, zone.name)
            for (zone, target) in ianaLinks.items()
            if isICUZone(zone)
        ),
    )

    # Remove unnecessary UTC mappings.
    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
    result = (
        (zone, target, icuTarget)
        for (zone, target, icuTarget) in result
        if target not in utcnames or icuTarget not in utcnames
    )

    return sorted(result, key=itemgetter(0))


generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT."
tzdataVersionComment = "// tzdata version = {0}"


def processTimeZones(
    tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignoreFactory, out
):
    """ Read the time zone info and create a new time zone cpp file. """
    print("Processing tzdata mapping...")
    (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)

    # Remove all legacy ICU time zones.
    icuZones = {zone for zone in icuZones if zone not in legacyZones}
    icuLinks = {
        zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
    }

    incorrectZones = findIncorrectICUZones(
        ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone
    )
    if not incorrectZones:
        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectLinks:
        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    print("Writing Intl tzdata file...")
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("")

        println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
        println("#define builtin_intl_TimeZoneDataGenerated_h")
        println("")

        println("namespace js {")
        println("namespace timezone {")
        println("")

        println("// Format:")
        println('// "ZoneName" // ICU-Name [time zone file]')
        println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
        for (zone, icuZone) in incorrectZones:
            println('    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
        println("};")
        println("")

        println("// Format:")
        println('// "LinkName", "Target" // ICU-Target [time zone file]')
        println("struct LinkAndTarget")
        println("{")
        println("    const char* const link;")
        println("    const char* const target;")
        println("};")
        println("")
        println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
        for (zone, target, icuTarget) in incorrectLinks:
            println(
                '    { "%s", "%s" }, // %s [%s]'
                % (zone, target, icuTarget, zone.filename)
            )
        println("};")
        println("")

        println(
            "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
        )
        println("// disallow the old and deprecated System V time zones.")
        println(
            "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
        )  # NOQA: E501
        println("const char* const legacyICUTimeZones[] = {")
        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
            println('    "%s",' % zone)
        println("};")
        println("")

        println("} // namespace timezone")
        println("} // namespace js")
        println("")
        println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")


def updateBackzoneLinks(tzdataDir, links):
    # Adapt a zone predicate to a (zone, target) pair.
    def withZone(fn):
        return lambda zone_target: fn(zone_target[0])

    (backzoneZones, backzoneLinks) = readIANAFiles(tzdataDir, ["backzone"])
    (stableZones, updatedLinks, updatedZones) = partition(
        links.items(),
        # Link not changed in backzone.
        withZone(lambda zone: zone not in backzoneLinks and zone not in backzoneZones),
        # Link has a new target.
        withZone(lambda zone: zone in backzoneLinks),
    )
    # Keep stable zones and links with updated target.
    return dict(
        chain(
            stableZones,
            map(withZone(lambda zone: (zone, backzoneLinks[zone])), updatedLinks),
        )
    )


def generateTzDataLinkTestContent(testDir, version, fileName, description, links):
    # Write a jstests file asserting that each link name resolves to its
    # expected target time zone.
    with io.open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(
            """
const tzMapper = [
    x => x,
    x => x.toUpperCase(),
    x => x.toLowerCase(),
];
"""
        )

        println(description)
        println("const links = {")
        for (zone, target) in sorted(links, key=itemgetter(0)):
            println('    "%s": "%s",' % (zone, target))
        println("};")

        println(
            """
for (let [linkName, target] of Object.entries(links)) {
    if (target === "Etc/UTC" || target === "Etc/GMT")
        target = "UTC";

    for (let map of tzMapper) {
        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
    }
}
"""
        )
        println(
            """
if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
    (zones, links) = readIANAFiles(tzdataDir, ["backward"])
    # The backward file contains only links, no zones.
    assert len(zones) == 0

    if not ignoreBackzone:
        links = updateBackzoneLinks(tzdataDir, links)

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_backward_links.js",
        "// Link names derived from IANA Time Zone Database, backward file.",
        links.items(),
    )


def generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
    tzfiles = filterfalse(
        {"backward", "backzone"}.__contains__, listIANAFiles(tzdataDir)
    )
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)

    if not ignoreBackzone:
        links = updateBackzoneLinks(tzdataDir, links)

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_notbackward_links.js",
        "// Link names derived from IANA Time Zone Database, excluding backward file.",
        links.items(),
    )


def generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir):
    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    if not ignoreBackzone:
        comment = """\
// This file was generated with historical, pre-1970 backzone information
// respected. Therefore, every zone key listed below is its own Zone, not
// a Link to a modern-day target as IANA ignoring backzones would say.

"""
    else:
        comment = """\
// This file was generated while ignoring historical, pre-1970 backzone
// information. Therefore, every zone key listed below is part of a Link
// whose target is the corresponding value.

"""

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_backzone.js",
        comment + "// Backzone zones derived from IANA Time Zone Database.",
        (
            (zone, zone if not ignoreBackzone else links[zone])
            for zone in backzones
            if zone in links
        ),
    )


def generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir):
    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    if not ignoreBackzone:
        comment = """\
// This file was generated with historical, pre-1970 backzone information
// respected. Therefore, every zone key listed below points to a target
// in the backzone file and not to its modern-day target as IANA ignoring
// backzones would say.

"""
    else:
        comment = """\
// This file was generated while ignoring historical, pre-1970 backzone
// information. Therefore, every zone key listed below is part of a Link
// whose target is the corresponding value ignoring any backzone entries.

"""

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_backzone_links.js",
        comment + "// Backzone links derived from IANA Time Zone Database.",
        (
            (zone, target if not ignoreBackzone else links[zone])
            for (zone, target) in backlinks.items()
        ),
    )


def generateTzDataTestVersion(tzdataDir, version, testDir):
    # Write a jstests file asserting that the built-in tzdata version matches
    # the version this script was run with.
    fileName = "timeZone_version.js"

    with io.open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("""const tzdata = "{0}";""".format(version))

        println(
            """
if (typeof getICUOptions === "undefined") {
    var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
}

var options = getICUOptions();

assertEq(options.tzdata, tzdata);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTests(tzdataDir, version, ignoreBackzone, testDir):
    # Regenerate all tzdata-derived jstests files.
    generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir)
    generateTzDataTestVersion(tzdataDir, version, testDir)


def updateTzdata(topsrcdir, args):
    """ Update the time zone cpp file. """

    icuDir = os.path.join(topsrcdir, "intl/icu/source")
    if not os.path.isdir(icuDir):
        raise RuntimeError("not a directory: %s" % icuDir)

    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
    if not os.path.isdir(icuTzDir):
        raise RuntimeError("not a directory: %s" % icuTzDir)

    dateTimeFormatTestDir = os.path.join(
        topsrcdir, "js/src/tests/non262/Intl/DateTimeFormat"
    )
    if not os.path.isdir(dateTimeFormatTestDir):
        raise RuntimeError("not a directory: %s" % dateTimeFormatTestDir)

    tzDir = args.tz
    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
        raise RuntimeError("not a directory or file: %s" % tzDir)
    ignoreBackzone = args.ignore_backzone
    # TODO: Accept or ignore the placeholder time zone "Factory"?
    ignoreFactory = False
    out = args.out

    version = icuTzDataVersion(icuTzDir)
    url = (
        "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
    )

    print("Arguments:")
    print("\ttzdata version: %s" % version)
    print("\ttzdata URL: %s" % url)
    print("\ttzdata directory|file: %s" % tzDir)
    print("\tICU directory: %s" % icuDir)
    print("\tICU timezone directory: %s" % icuTzDir)
    print("\tIgnore backzone file: %s" % ignoreBackzone)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(f):
        # |f| is either a tzdata tar archive or an extracted directory.
        if os.path.isfile(f) and tarfile.is_tarfile(f):
            with tarfile.open(f, "r:*") as tar:
                processTimeZones(
                    TzDataFile(tar),
                    icuDir,
                    icuTzDir,
                    version,
                    ignoreBackzone,
                    ignoreFactory,
                    out,
                )
                generateTzDataTests(
                    TzDataFile(tar), version, ignoreBackzone, dateTimeFormatTestDir
                )
        elif os.path.isdir(f):
            processTimeZones(
                TzDataDir(f),
                icuDir,
                icuTzDir,
                version,
                ignoreBackzone,
                ignoreFactory,
                out,
            )
            generateTzDataTests(
                TzDataDir(f), version, ignoreBackzone, dateTimeFormatTestDir
            )
        else:
            raise RuntimeError("unknown format")

    if tzDir is None:
        print("Downloading tzdata file...")
        with closing(urlopen(url)) as tzfile:
            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
                print("File stored in %s" % tztmpfile.name)
                tztmpfile.write(tzfile.read())
                tztmpfile.flush()
                updateFrom(tztmpfile.name)
    else:
        updateFrom(tzDir)


def readCurrencyFile(tree):
    # Yield (currency, minorUnits, currencyName, countryName) for every entry
    # in the ISO 4217 XML tree whose decimal digits differ from the default.
    reCurrency = re.compile(r"^[A-Z]{3}$")
    reIntMinorUnits = re.compile(r"^\d+$")

    for country in tree.iterfind(".//CcyNtry"):
        # Skip entry if no currency information is available.
        currency = country.findtext("Ccy")
        if currency is None:
            continue
        assert reCurrency.match(currency)

        minorUnits = country.findtext("CcyMnrUnts")
        assert minorUnits is not None

        # Skip all entries without minorUnits or which use the default minorUnits.
        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
            currencyName = country.findtext("CcyNm")
            countryName = country.findtext("CtryNm")
            yield (currency, int(minorUnits), currencyName, countryName)


def writeCurrencyFile(published, currencies, out):
    # Write the generated currency-digits mapping file to |out|.
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println("// Version: {}".format(published))

        println(
            """
/**
 * Mapping from currency codes to the number of decimal digits used for them.
 * Default is 2 digits.
 *
 * Spec: ISO 4217 Currency and Funds Code List.
 * http://www.currency-iso.org/en/home/tables/table-a1.html
 */"""
        )
        println("var currencyDigits = {")
        # A currency can be listed for several countries; emit one comment per
        # country but a single mapping entry per currency code.
        for (currency, entries) in groupby(
            sorted(currencies, key=itemgetter(0)), itemgetter(0)
        ):
            for (_, minorUnits, currencyName, countryName) in entries:
                println("    // {} ({})".format(currencyName, countryName))
            println("    {}: {},".format(currency, minorUnits))
        println("};")


def updateCurrency(topsrcdir, args):
    """ Update the CurrencyDataGenerated.js file. """
    import xml.etree.ElementTree as ET
    from random import randint

    url = args.url
    out = args.out
    filename = args.file

    print("Arguments:")
    print("\tDownload url: %s" % url)
    print("\tLocal currency file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(currencyFile):
        print("Processing currency code list file...")
        tree = ET.parse(currencyFile)
        published = tree.getroot().attrib["Pblshd"]
        currencies = readCurrencyFile(tree)

        print("Writing CurrencyData file...")
        writeCurrencyFile(published, currencies, out)

    if filename is not None:
        print("Always make sure you have the newest currency code list file!")
        updateFrom(filename)
    else:
        print("Downloading currency & funds code list...")
        request = UrlRequest(url)
        # Use a randomized User-agent to avoid being blocked by the server.
        request.add_header(
            "User-agent",
            "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
                randint(1, 999)
            ),
        )
        with closing(urlopen(request)) as currencyFile:
            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
                print("File stored in %s" % currencyTmpFile.name)
                currencyTmpFile.write(currencyFile.read())
                currencyTmpFile.flush()
                updateFrom(currencyTmpFile.name)


def writeUnicodeExtensionsMappings(println, mapping, extension):
    # Emit the C++ helpers and replacement tables for the given Unicode
    # extension (e.g. "Unicode" or "Transform").
    println(
        """
template <size_t Length>
static inline bool Is{0}Key(
    mozilla::Span<const char> key, const char (&str)[Length]) {{
  static_assert(Length == {0}KeyLength + 1,
                "{0} extension key is two characters long");
  return memcmp(key.data(), str, Length - 1) == 0;
}}

template <size_t Length>
static inline bool Is{0}Type(
    mozilla::Span<const char> type, const char (&str)[Length]) {{
  static_assert(Length > {0}KeyLength + 1,
                "{0} extension type contains more than two characters");
  return type.size() == (Length - 1) &&
         memcmp(type.data(), str, Length - 1) == 0;
}}
""".format(
            extension
        ).rstrip(
            "\n"
        )
    )

    linear_search_max_length = 4

    # A binary search is only emitted when at least one key has more than
    # |linear_search_max_length| replacements.
    needs_binary_search = any(
        len(replacements.items()) > linear_search_max_length
        for replacements in mapping.values()
    )

    if needs_binary_search:
        println(
            """
static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
  MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.size(); i++) {{
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
      return r;
    }}
  }}

  // Return zero if both strings are equal or a negative number if |b| is a
  // prefix of |a|.
  return -int32_t(UnsignedChar(a[b.size()]));
}}

template <size_t Length>
static inline const char* Search{0}Replacement(
    const char* (&types)[Length], const char* (&aliases)[Length],
    mozilla::Span<const char> type) {{

  auto p = std::lower_bound(std::begin(types), std::end(types), type,
                            [](const auto& a, const auto& b) {{
                              return Compare{0}Type(a, b) < 0;
                            }});
  if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
    return aliases[std::distance(std::begin(types), p)];
  }}
  return nullptr;
}}
""".format(
                extension
            ).rstrip(
                "\n"
            )
        )

    println(
        """
/**
 * Mapping from deprecated BCP 47 {0} extension types to their preferred
 * values.
3218 * 3219 * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files 3220 * Spec: https://www.unicode.org/reports/tr35/#t_Extension 3221 */ 3222const char* js::intl::LanguageTag::replace{0}ExtensionType( 3223 mozilla::Span<const char> key, mozilla::Span<const char> type) {{ 3224 MOZ_ASSERT(key.size() == {0}KeyLength); 3225 MOZ_ASSERT(IsCanonicallyCased{0}Key(key)); 3226 3227 MOZ_ASSERT(type.size() > {0}KeyLength); 3228 MOZ_ASSERT(IsCanonicallyCased{0}Type(type)); 3229""".format( 3230 extension 3231 ) 3232 ) 3233 3234 def to_hash_key(replacements): 3235 return str(sorted(replacements.items())) 3236 3237 def write_array(subtags, name, length): 3238 max_entries = (80 - len(" ")) // (length + len('"", ')) 3239 3240 println(" static const char* {}[{}] = {{".format(name, len(subtags))) 3241 3242 for entries in grouper(subtags, max_entries): 3243 entries = ( 3244 '"{}"'.format(tag).rjust(length + 2) 3245 for tag in entries 3246 if tag is not None 3247 ) 3248 println(" {},".format(", ".join(entries))) 3249 3250 println(" };") 3251 3252 # Merge duplicate keys. 
3253 key_aliases = {} 3254 for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): 3255 hash_key = to_hash_key(replacements) 3256 if hash_key not in key_aliases: 3257 key_aliases[hash_key] = [] 3258 else: 3259 key_aliases[hash_key].append(key) 3260 3261 first_key = True 3262 for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): 3263 hash_key = to_hash_key(replacements) 3264 if key in key_aliases[hash_key]: 3265 continue 3266 3267 cond = ( 3268 'Is{}Key(key, "{}")'.format(extension, k) 3269 for k in [key] + key_aliases[hash_key] 3270 ) 3271 3272 if_kind = "if" if first_key else "else if" 3273 cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) 3274 println( 3275 """ 3276 {} ({}) {{""".format( 3277 if_kind, cond 3278 ).strip( 3279 "\n" 3280 ) 3281 ) 3282 first_key = False 3283 3284 replacements = sorted(replacements.items(), key=itemgetter(0)) 3285 3286 if len(replacements) > linear_search_max_length: 3287 types = [t for (t, _) in replacements] 3288 preferred = [r for (_, r) in replacements] 3289 max_len = max(len(k) for k in types + preferred) 3290 3291 write_array(types, "types", max_len) 3292 write_array(preferred, "aliases", max_len) 3293 println( 3294 """ 3295 return Search{}Replacement(types, aliases, type); 3296""".format( 3297 extension 3298 ).strip( 3299 "\n" 3300 ) 3301 ) 3302 else: 3303 for (type, replacement) in replacements: 3304 println( 3305 """ 3306 if (Is{}Type(type, "{}")) {{ 3307 return "{}"; 3308 }}""".format( 3309 extension, type, replacement 3310 ).strip( 3311 "\n" 3312 ) 3313 ) 3314 3315 println( 3316 """ 3317 }""".lstrip( 3318 "\n" 3319 ) 3320 ) 3321 3322 println( 3323 """ 3324 return nullptr; 3325} 3326""".strip( 3327 "\n" 3328 ) 3329 ) 3330 3331 3332def readICUUnitResourceFile(filepath): 3333 """Return a set of unit descriptor pairs where the first entry denotes the unit type and the 3334 second entry the unit name. 
def readICUUnitResourceFile(filepath):
    """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
    second entry the unit name.

    Example:

    root{
        units{
            compound{
            }
            coordinate{
            }
            length{
                meter{
                }
            }
        }
        unitsNarrow:alias{"/LOCALE/unitsShort"}
        unitsShort{
            duration{
                day{
                }
                day-person:alias{"/LOCALE/unitsShort/duration/day"}
            }
            length{
                meter{
                }
            }
        }
    }

    Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}
    """

    open_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
    close_table_re = re.compile(r"^\}$")
    entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")

    # Innermost table currently being filled; its enclosing tables live on
    # |stack| until their closing brace is seen.
    current = {}
    stack = []

    # Whether the parser is inside a /* ... */ comment.
    in_comment = False

    for raw_line in flines(filepath, "utf-8-sig"):
        line = raw_line.strip()

        # Comment handling: finish an open block comment, then drop line
        # comments and block-comment openers.
        if in_comment:
            if line.endswith("*/"):
                in_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_comment = True
            continue

        # A table opener such as `length{` or `meter{` pushes a new scope.
        opened = open_table_re.match(line)
        if opened:
            stack.append(current)
            child = {}
            current[opened.group(1)] = child
            current = child
            continue

        # A lone `}` pops back to the enclosing table.
        if close_table_re.match(line):
            current = stack.pop()
            continue

        # A leaf entry such as `dnam{"meter"}` stores its string value.
        entry = entry_re.match(line)
        if entry:
            current[entry.group(1)] = entry.group(2)
            continue

        raise Exception("unexpected line: '{}' in {}".format(line, filepath))

    assert len(stack) == 0, "Not all tables closed"
    assert len(current) == 1, "More than one root table"

    # Drop the top-level language identifier table (e.g. "root").
    (_, unit_table) = current.popitem()

    def strip_alias_suffix(name):
        # `day-person:alias{...}` entries refer to a real unit; keep the bare
        # name without the ":alias" marker.
        return name[: -len(":alias")] if name.endswith(":alias") else name

    # Collect units from the three display formats "units", "unitsNarrow",
    # and "unitsShort", excluding the pseudo-units "compound" and "coordinate".
    found = set()
    for display_format in ("units", "unitsNarrow", "unitsShort"):
        if display_format not in unit_table:
            continue
        for (unit_type, unit_names) in unit_table[display_format].items():
            if unit_type == "compound" or unit_type == "coordinate":
                continue
            for unit_name in unit_names.keys():
                found.add((unit_type, strip_alias_suffix(unit_name)))
    return found
3443 """ 3444 3445 def find_match(unit): 3446 unit_match = [ 3447 (unit_type, unit_name) 3448 for (unit_type, unit_name) in all_units 3449 if unit_name == unit 3450 ] 3451 if unit_match: 3452 assert len(unit_match) == 1 3453 return unit_match[0] 3454 return None 3455 3456 def compound_unit_identifiers(): 3457 for numerator in sanctioned_units: 3458 for denominator in sanctioned_units: 3459 yield "{}-per-{}".format(numerator, denominator) 3460 3461 supported_simple_units = {find_match(unit) for unit in sanctioned_units} 3462 assert None not in supported_simple_units 3463 3464 supported_compound_units = { 3465 unit_match 3466 for unit_match in (find_match(unit) for unit in compound_unit_identifiers()) 3467 if unit_match 3468 } 3469 3470 return supported_simple_units | supported_compound_units 3471 3472 3473def readICUDataFilterForUnits(data_filter_file): 3474 with io.open(data_filter_file, mode="r", encoding="utf-8") as f: 3475 data_filter = json.load(f) 3476 3477 # Find the rule set for the "unit_tree". 3478 unit_tree_rules = [ 3479 entry["rules"] 3480 for entry in data_filter["resourceFilters"] 3481 if entry["categories"] == ["unit_tree"] 3482 ] 3483 assert len(unit_tree_rules) == 1 3484 3485 # Compute the list of included units from that rule set. The regular expression must match 3486 # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound". 
3487 included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$") 3488 filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0]) 3489 3490 return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit} 3491 3492 3493def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units): 3494 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3495 3496 def find_unit_type(unit): 3497 result = [ 3498 unit_type for (unit_type, unit_name) in all_units if unit_name == unit 3499 ] 3500 assert result and len(result) == 1 3501 return result[0] 3502 3503 sanctioned_js_file = os.path.join( 3504 js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js" 3505 ) 3506 with io.open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f: 3507 println = partial(print, file=f) 3508 3509 sanctioned_units_object = json.dumps( 3510 {unit: True for unit in sorted(sanctioned_units)}, 3511 sort_keys=True, 3512 indent=4, 3513 separators=(",", ": "), 3514 ) 3515 3516 println(generatedFileWarning) 3517 3518 println( 3519 """ 3520/** 3521 * The list of currently supported simple unit identifiers. 3522 * 3523 * Intl.NumberFormat Unified API Proposal 3524 */""" 3525 ) 3526 3527 println( 3528 "var sanctionedSimpleUnitIdentifiers = {};".format(sanctioned_units_object) 3529 ) 3530 3531 sanctioned_cpp_file = os.path.join( 3532 js_src_builtin_intl_dir, "MeasureUnitGenerated.h" 3533 ) 3534 with io.open(sanctioned_cpp_file, mode="w", encoding="utf-8", newline="") as f: 3535 println = partial(print, file=f) 3536 3537 println(generatedFileWarning) 3538 3539 println( 3540 """ 3541struct MeasureUnit { 3542 const char* const type; 3543 const char* const name; 3544}; 3545 3546/** 3547 * The list of currently supported simple unit identifiers. 3548 * 3549 * The list must be kept in alphabetical order of |name|. 
3550 */ 3551inline constexpr MeasureUnit simpleMeasureUnits[] = { 3552 // clang-format off""" 3553 ) 3554 3555 for unit_name in sorted(sanctioned_units): 3556 println(' {{"{}", "{}"}},'.format(find_unit_type(unit_name), unit_name)) 3557 3558 println( 3559 """ 3560 // clang-format on 3561};""".lstrip( 3562 "\n" 3563 ) 3564 ) 3565 3566 shutil.copyfile( 3567 sanctioned_cpp_file, 3568 os.path.join( 3569 js_src_builtin_intl_dir, 3570 "../../../../intl/components/src/MeasureUnitGenerated.h", 3571 ), 3572 ) 3573 3574 writeUnitTestFiles(all_units, sanctioned_units) 3575 3576 3577def writeUnitTestFiles(all_units, sanctioned_units): 3578 """ Generate test files for unit number formatters. """ 3579 3580 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3581 test_dir = os.path.join( 3582 js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat" 3583 ) 3584 3585 def write_test(file_name, test_content, indent=4): 3586 file_path = os.path.join(test_dir, file_name) 3587 with io.open(file_path, mode="w", encoding="utf-8", newline="") as f: 3588 println = partial(print, file=f) 3589 3590 println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))') 3591 println("") 3592 println(generatedFileWarning) 3593 println("") 3594 3595 sanctioned_units_array = json.dumps( 3596 [unit for unit in sorted(sanctioned_units)], 3597 indent=indent, 3598 separators=(",", ": "), 3599 ) 3600 3601 println( 3602 "const sanctionedSimpleUnitIdentifiers = {};".format( 3603 sanctioned_units_array 3604 ) 3605 ) 3606 3607 println(test_content) 3608 3609 println( 3610 """ 3611if (typeof reportCompare === "function") 3612{}reportCompare(true, true);""".format( 3613 " " * indent 3614 ) 3615 ) 3616 3617 write_test( 3618 "unit-compound-combinations.js", 3619 """ 3620// Test all simple unit identifier combinations are allowed. 
3621 3622for (const numerator of sanctionedSimpleUnitIdentifiers) { 3623 for (const denominator of sanctionedSimpleUnitIdentifiers) { 3624 const unit = `${numerator}-per-${denominator}`; 3625 const nf = new Intl.NumberFormat("en", {style: "unit", unit}); 3626 3627 assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join("")); 3628 } 3629}""", 3630 ) 3631 3632 all_units_array = json.dumps( 3633 ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ") 3634 ) 3635 3636 write_test( 3637 "unit-well-formed.js", 3638 """ 3639const allUnits = {}; 3640""".format( 3641 all_units_array 3642 ) 3643 + """ 3644// Test only sanctioned unit identifiers are allowed. 3645 3646for (const typeAndUnit of allUnits) { 3647 const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/); 3648 3649 let allowed; 3650 if (unit.includes("-per-")) { 3651 const [numerator, denominator] = unit.split("-per-"); 3652 allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) && 3653 sanctionedSimpleUnitIdentifiers.includes(denominator); 3654 } else { 3655 allowed = sanctionedSimpleUnitIdentifiers.includes(unit); 3656 } 3657 3658 if (allowed) { 3659 const nf = new Intl.NumberFormat("en", {style: "unit", unit}); 3660 assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join("")); 3661 } else { 3662 assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}), 3663 RangeError, `Missing error for "${typeAndUnit}"`); 3664 } 3665}""", 3666 ) 3667 3668 write_test( 3669 "unit-formatToParts-has-unit-field.js", 3670 """ 3671// Test only English and Chinese to keep the overall runtime reasonable. 3672// 3673// Chinese is included because it contains more than one "unit" element for 3674// certain unit combinations. 3675const locales = ["en", "zh"]; 3676 3677// Plural rules for English only differentiate between "one" and "other". Plural 3678// rules for Chinese only use "other". That means we only need to test two values 3679// per unit. 
3680const values = [0, 1]; 3681 3682// Ensure unit formatters contain at least one "unit" element. 3683 3684for (const locale of locales) { 3685 for (const unit of sanctionedSimpleUnitIdentifiers) { 3686 const nf = new Intl.NumberFormat(locale, {style: "unit", unit}); 3687 3688 for (const value of values) { 3689 assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true, 3690 `locale=${locale}, unit=${unit}`); 3691 } 3692 } 3693 3694 for (const numerator of sanctionedSimpleUnitIdentifiers) { 3695 for (const denominator of sanctionedSimpleUnitIdentifiers) { 3696 const unit = `${numerator}-per-${denominator}`; 3697 const nf = new Intl.NumberFormat(locale, {style: "unit", unit}); 3698 3699 for (const value of values) { 3700 assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true, 3701 `locale=${locale}, unit=${unit}`); 3702 } 3703 } 3704 } 3705}""", 3706 indent=2, 3707 ) 3708 3709 3710def updateUnits(topsrcdir, args): 3711 icu_path = os.path.join(topsrcdir, "intl", "icu") 3712 icu_unit_path = os.path.join(icu_path, "source", "data", "unit") 3713 3714 with io.open( 3715 "SanctionedSimpleUnitIdentifiers.yaml", mode="r", encoding="utf-8" 3716 ) as f: 3717 sanctioned_units = yaml.safe_load(f) 3718 3719 # Read all possible ICU unit identifiers from the "unit/root.txt" resource. 3720 unit_root_file = os.path.join(icu_unit_path, "root.txt") 3721 all_units = readICUUnitResourceFile(unit_root_file) 3722 3723 # Compute the set of effectively supported ICU unit identifiers. 3724 supported_units = computeSupportedUnits(all_units, sanctioned_units) 3725 3726 # Read the list of units we're including into the ICU data file. 3727 data_filter_file = os.path.join(icu_path, "data_filter.json") 3728 filtered_units = readICUDataFilterForUnits(data_filter_file) 3729 3730 # Both sets must match to avoid resource loading errors at runtime. 
def readICUNumberingSystemsResourceFile(filepath):
    """Returns a dictionary of numbering systems where the key denotes the numbering system name
    and the value a dictionary with additional numbering system data.

    Example:

    numberingSystems:table(nofallback){
        numberingSystems{
            latn{
                algorithmic:int{0}
                desc{"0123456789"}
                radix:int{10}
            }
            roman{
                algorithmic:int{1}
                desc{"%roman-upper"}
                radix:int{10}
            }
        }
    }

    Returns {"latn": {"digits": "0123456789", "algorithmic": False},
             "roman": {"algorithmic": True}}
    """

    # The optional ":<type>" suffix (e.g. ":int", ":table(nofallback)") is
    # matched but discarded; entry values may be quoted strings or bare ints.
    start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
    end_table_re = re.compile(r"^\}$")
    table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")

    # The current resource table.
    table = {}

    # List of parent tables when parsing.
    parents = []

    # Track multi-line comments state.
    in_multiline_comment = False

    for line in flines(filepath, "utf-8-sig"):
        # Remove leading and trailing whitespace.
        line = line.strip()

        # Skip over comments.
        if in_multiline_comment:
            if line.endswith("*/"):
                in_multiline_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_multiline_comment = True
            continue

        # Try to match the start of a table, e.g. `latn{`.
        match = start_table_re.match(line)
        if match:
            parents.append(table)
            table_name = match.group(1)
            new_table = {}
            table[table_name] = new_table
            table = new_table
            continue

        # Try to match the end of a table.
        match = end_table_re.match(line)
        if match:
            table = parents.pop()
            continue

        # Try to match a table entry, e.g. `desc{"0123456789"}`.
        match = table_entry_re.match(line)
        if match:
            entry_key = match.group(1)
            # Quoted values stay strings; bare values like `int{10}` become ints.
            entry_value = (
                match.group(2) if match.group(2) is not None else int(match.group(3))
            )
            table[entry_key] = entry_value
            continue

        raise Exception("unexpected line: '{}' in {}".format(line, filepath))

    assert len(parents) == 0, "Not all tables closed"
    assert len(table) == 1, "More than one root table"

    # Remove the two top-level "numberingSystems" tables.
    (_, numbering_systems) = table.popitem()
    (_, numbering_systems) = numbering_systems.popitem()

    # Assert all numbering systems use base 10.
    assert all(ns["radix"] == 10 for ns in numbering_systems.values())

    # Return the numbering systems. Simple (non-algorithmic) systems carry
    # their digit string; algorithmic systems are only flagged as such.
    return {
        key: {"digits": value["desc"], "algorithmic": False}
        if not bool(value["algorithmic"])
        else {"algorithmic": True}
        for (key, value) in numbering_systems.items()
    }
3843 return { 3844 key: {"digits": value["desc"], "algorithmic": False} 3845 if not bool(value["algorithmic"]) 3846 else {"algorithmic": True} 3847 for (key, value) in numbering_systems.items() 3848 } 3849 3850 3851def writeNumberingSystemFiles(numbering_systems): 3852 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3853 3854 numbering_systems_js_file = os.path.join( 3855 js_src_builtin_intl_dir, "NumberingSystemsGenerated.h" 3856 ) 3857 with io.open( 3858 numbering_systems_js_file, mode="w", encoding="utf-8", newline="" 3859 ) as f: 3860 println = partial(print, file=f) 3861 3862 println(generatedFileWarning) 3863 3864 println( 3865 """ 3866/** 3867 * The list of numbering systems with simple digit mappings. 3868 */ 3869 3870#ifndef builtin_intl_NumberingSystemsGenerated_h 3871#define builtin_intl_NumberingSystemsGenerated_h 3872""" 3873 ) 3874 3875 simple_numbering_systems = sorted( 3876 name 3877 for (name, value) in numbering_systems.items() 3878 if not value["algorithmic"] 3879 ) 3880 3881 println("// clang-format off") 3882 println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\") 3883 println( 3884 "{}".format( 3885 ", \\\n".join( 3886 ' "{}"'.format(name) for name in simple_numbering_systems 3887 ) 3888 ) 3889 ) 3890 println("// clang-format on") 3891 println("") 3892 3893 println("#endif // builtin_intl_NumberingSystemsGenerated_h") 3894 3895 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3896 test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl") 3897 3898 intl_shell_js_file = os.path.join(test_dir, "shell.js") 3899 3900 with io.open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f: 3901 println = partial(print, file=f) 3902 3903 println(generatedFileWarning) 3904 3905 println( 3906 """ 3907// source: CLDR file common/bcp47/number.xml; version CLDR {}. 
3908// https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml 3909// https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml 3910""".format( 3911 readCLDRVersionFromICU() 3912 ).rstrip() 3913 ) 3914 3915 numbering_systems_object = json.dumps( 3916 numbering_systems, 3917 indent=2, 3918 separators=(",", ": "), 3919 sort_keys=True, 3920 ensure_ascii=False, 3921 ) 3922 println("const numberingSystems = {};".format(numbering_systems_object)) 3923 3924 3925def updateNumberingSystems(topsrcdir, args): 3926 icu_path = os.path.join(topsrcdir, "intl", "icu") 3927 icu_misc_path = os.path.join(icu_path, "source", "data", "misc") 3928 3929 with io.open("NumberingSystems.yaml", mode="r", encoding="utf-8") as f: 3930 numbering_systems = yaml.safe_load(f) 3931 3932 # Read all possible ICU unit identifiers from the "misc/numberingSystems.txt" resource. 3933 misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt") 3934 all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file) 3935 3936 all_numbering_systems_simple_digits = { 3937 name 3938 for (name, value) in all_numbering_systems.items() 3939 if not value["algorithmic"] 3940 } 3941 3942 # Assert ICU includes support for all required numbering systems. If this assertion fails, 3943 # something is broken in ICU. 3944 assert all_numbering_systems_simple_digits.issuperset( 3945 numbering_systems 3946 ), "{}".format(numbering_systems.difference(all_numbering_systems_simple_digits)) 3947 3948 # Assert the spec requires support for all numbering systems with simple digit mappings. If 3949 # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new 3950 # numbering systems. 
if __name__ == "__main__":
    import argparse

    # This script must reside in js/src/builtin/intl to work correctly.
    (thisDir, thisFile) = os.path.split(os.path.abspath(sys.argv[0]))
    dirPaths = os.path.normpath(thisDir).split(os.sep)
    if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
        raise RuntimeError("%s must reside in js/src/builtin/intl" % sys.argv[0])
    topsrcdir = "/".join(dirPaths[:-4])

    def EnsureHttps(v):
        # argparse |type| callback: reject any download URL that isn't https.
        if not v.startswith("https:"):
            # Bug fix: the format string lacked its "%s" placeholder, so the
            # "%" operator raised TypeError ("not all arguments converted")
            # instead of the intended ArgumentTypeError with the bad URL.
            raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
        return v

    parser = argparse.ArgumentParser(description="Update intl data.")
    subparsers = parser.add_subparsers(help="Select update mode")

    parser_cldr_tags = subparsers.add_parser(
        "langtags", help="Update CLDR language tags data"
    )
    parser_cldr_tags.add_argument(
        "--version", metavar="VERSION", help="CLDR version number"
    )
    parser_cldr_tags.add_argument(
        "--url",
        metavar="URL",
        default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
        type=EnsureHttps,
        help="Download url CLDR data (default: %(default)s)",
    )
    parser_cldr_tags.add_argument(
        "--out",
        default="LanguageTagGenerated.cpp",
        help="Output file (default: %(default)s)",
    )
    parser_cldr_tags.add_argument(
        "file", nargs="?", help="Local cldr-core.zip file, if omitted uses <URL>"
    )
    parser_cldr_tags.set_defaults(func=updateCLDRLangTags)

    parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
    parser_tz.add_argument(
        "--tz",
        help="Local tzdata directory or file, if omitted downloads tzdata "
        "distribution from https://www.iana.org/time-zones/",
    )
    # ICU doesn't include the backzone file by default, but we still like to
    # use the backzone time zone names to avoid user confusion. This does lead
    # to formatting "historic" dates (pre-1970 era) with the wrong time zone,
    # but that's probably acceptable for now.
    parser_tz.add_argument(
        "--ignore-backzone",
        action="store_true",
        help="Ignore tzdata's 'backzone' file. Can be enabled to generate more "
        "accurate time zone canonicalization reflecting the actual time "
        "zones as used by ICU.",
    )
    parser_tz.add_argument(
        "--out",
        default="TimeZoneDataGenerated.h",
        help="Output file (default: %(default)s)",
    )
    parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))

    parser_currency = subparsers.add_parser(
        "currency", help="Update currency digits mapping"
    )
    parser_currency.add_argument(
        "--url",
        metavar="URL",
        default="https://www.currency-iso.org/dam/downloads/lists/list_one.xml",  # NOQA: E501
        type=EnsureHttps,
        help="Download url for the currency & funds code list (default: "
        "%(default)s)",
    )
    parser_currency.add_argument(
        "--out",
        default="CurrencyDataGenerated.js",
        help="Output file (default: %(default)s)",
    )
    parser_currency.add_argument(
        "file", nargs="?", help="Local currency code list file, if omitted uses <URL>"
    )
    parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir))

    parser_units = subparsers.add_parser(
        "units", help="Update sanctioned unit identifiers mapping"
    )
    parser_units.set_defaults(func=partial(updateUnits, topsrcdir))

    parser_numbering_systems = subparsers.add_parser(
        "numbering", help="Update numbering systems with simple " "digit mappings"
    )
    parser_numbering_systems.set_defaults(
        func=partial(updateNumberingSystems, topsrcdir)
    )

    args = parser.parse_args()
    args.func(args)