#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage:
    make_intl_data.py langtags [cldr_core.zip]
    make_intl_data.py tzdata
    make_intl_data.py currency
    make_intl_data.py units
    make_intl_data.py numbering


    Target "langtags":
    This script extracts information about 1) mappings between deprecated and
    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
    BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
    code in intl/components/LocaleGenerated.cpp. The code is used in
    intl/components/Locale.cpp.


    Target "tzdata":
    This script computes which time zone information is not up-to-date in ICU
    and provides the necessary mappings to work around this problem.
    https://ssl.icu-project.org/trac/ticket/12044


    Target "currency":
    Generates the mapping from currency codes to decimal digits used for them.


    Target "units":
    Generate source and test files using the list of so-called "sanctioned unit
    identifiers" and verifies that the ICU data filter includes these units.


    Target "numbering":
    Generate source and test files using the list of numbering systems with
    simple digit mappings and verifies that it's in sync with ICU/CLDR.
42""" 43 44from __future__ import print_function 45import os 46import re 47import io 48import json 49import sys 50import tarfile 51import tempfile 52import yaml 53from contextlib import closing 54from functools import partial, total_ordering 55from itertools import chain, groupby, tee 56from operator import attrgetter, itemgetter 57from zipfile import ZipFile 58 59if sys.version_info.major == 2: 60 from itertools import ( 61 ifilter as filter, 62 ifilterfalse as filterfalse, 63 imap as map, 64 izip_longest as zip_longest, 65 ) 66 from urllib2 import urlopen, Request as UrlRequest 67 from urlparse import urlsplit 68else: 69 from itertools import filterfalse, zip_longest 70 from urllib.request import urlopen, Request as UrlRequest 71 from urllib.parse import urlsplit 72 73 74# From https://docs.python.org/3/library/itertools.html 75def grouper(iterable, n, fillvalue=None): 76 "Collect data into fixed-length chunks or blocks" 77 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" 78 args = [iter(iterable)] * n 79 return zip_longest(*args, fillvalue=fillvalue) 80 81 82def writeMappingHeader(println, description, source, url): 83 if type(description) is not list: 84 description = [description] 85 for desc in description: 86 println("// {0}".format(desc)) 87 println("// Derived from {0}.".format(source)) 88 println("// {0}".format(url)) 89 90 91def writeMappingsVar(println, mapping, name, description, source, url): 92 """Writes a variable definition with a mapping table. 93 94 Writes the contents of dictionary |mapping| through the |println| 95 function with the given variable name and a comment with description, 96 fileDate, and URL. 
97 """ 98 println("") 99 writeMappingHeader(println, description, source, url) 100 println("var {0} = {{".format(name)) 101 for (key, value) in sorted(mapping.items(), key=itemgetter(0)): 102 println(' "{0}": "{1}",'.format(key, value)) 103 println("};") 104 105 106def writeMappingsBinarySearch( 107 println, 108 fn_name, 109 type_name, 110 name, 111 validate_fn, 112 validate_case_fn, 113 mappings, 114 tag_maxlength, 115 description, 116 source, 117 url, 118): 119 """Emit code to perform a binary search on language tag subtags. 120 121 Uses the contents of |mapping|, which can either be a dictionary or set, 122 to emit a mapping function to find subtag replacements. 123 """ 124 println("") 125 writeMappingHeader(println, description, source, url) 126 println( 127 """ 128bool mozilla::intl::Locale::{0}({1} {2}) {{ 129 MOZ_ASSERT({3}({2}.Span())); 130 MOZ_ASSERT({4}({2}.Span())); 131""".format( 132 fn_name, type_name, name, validate_fn, validate_case_fn 133 ).strip() 134 ) 135 writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength) 136 137 println( 138 """ 139}""".lstrip( 140 "\n" 141 ) 142 ) 143 144 145def writeMappingsBinarySearchBody( 146 println, source_name, target_name, mappings, tag_maxlength 147): 148 def write_array(subtags, name, length, fixed): 149 if fixed: 150 println( 151 " static const char {}[{}][{}] = {{".format( 152 name, len(subtags), length + 1 153 ) 154 ) 155 else: 156 println(" static const char* {}[{}] = {{".format(name, len(subtags))) 157 158 # Group in pairs of ten to not exceed the 80 line column limit. 159 for entries in grouper(subtags, 10): 160 entries = ( 161 '"{}"'.format(tag).rjust(length + 2) 162 for tag in entries 163 if tag is not None 164 ) 165 println(" {},".format(", ".join(entries))) 166 167 println(" };") 168 169 trailing_return = True 170 171 # Sort the subtags by length. 
That enables using an optimized comparator 172 # for the binary search, which only performs a single |memcmp| for multiple 173 # of two subtag lengths. 174 mappings_keys = mappings.keys() if type(mappings) == dict else mappings 175 for (length, subtags) in groupby(sorted(mappings_keys, key=len), len): 176 # Omit the length check if the current length is the maximum length. 177 if length != tag_maxlength: 178 println( 179 """ 180 if ({}.Length() == {}) {{ 181""".format( 182 source_name, length 183 ).rstrip( 184 "\n" 185 ) 186 ) 187 else: 188 trailing_return = False 189 println( 190 """ 191 { 192""".rstrip( 193 "\n" 194 ) 195 ) 196 197 # The subtags need to be sorted for binary search to work. 198 subtags = sorted(subtags) 199 200 def equals(subtag): 201 return """{}.EqualTo("{}")""".format(source_name, subtag) 202 203 # Don't emit a binary search for short lists. 204 if len(subtags) == 1: 205 if type(mappings) == dict: 206 println( 207 """ 208 if ({}) {{ 209 {}.Set(mozilla::MakeStringSpan("{}")); 210 return true; 211 }} 212 return false; 213""".format( 214 equals(subtags[0]), target_name, mappings[subtags[0]] 215 ).strip( 216 "\n" 217 ) 218 ) 219 else: 220 println( 221 """ 222 return {}; 223""".format( 224 equals(subtags[0]) 225 ).strip( 226 "\n" 227 ) 228 ) 229 elif len(subtags) <= 4: 230 if type(mappings) == dict: 231 for subtag in subtags: 232 println( 233 """ 234 if ({}) {{ 235 {}.Set("{}"); 236 return true; 237 }} 238""".format( 239 equals(subtag), target_name, mappings[subtag] 240 ).strip( 241 "\n" 242 ) 243 ) 244 245 println( 246 """ 247 return false; 248""".strip( 249 "\n" 250 ) 251 ) 252 else: 253 cond = (equals(subtag) for subtag in subtags) 254 cond = (" ||\n" + " " * (4 + len("return "))).join(cond) 255 println( 256 """ 257 return {}; 258""".format( 259 cond 260 ).strip( 261 "\n" 262 ) 263 ) 264 else: 265 write_array(subtags, source_name + "s", length, True) 266 267 if type(mappings) == dict: 268 write_array([mappings[k] for k in subtags], "aliases", 
length, False) 269 270 println( 271 """ 272 if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ 273 {1}.Set(mozilla::MakeStringSpan(replacement)); 274 return true; 275 }} 276 return false; 277""".format( 278 source_name, target_name 279 ).rstrip() 280 ) 281 else: 282 println( 283 """ 284 return HasReplacement({0}s, {0}); 285""".format( 286 source_name 287 ).rstrip() 288 ) 289 290 println( 291 """ 292 } 293""".strip( 294 "\n" 295 ) 296 ) 297 298 if trailing_return: 299 println( 300 """ 301 return false;""" 302 ) 303 304 305def writeComplexLanguageTagMappings( 306 println, complex_language_mappings, description, source, url 307): 308 println("") 309 writeMappingHeader(println, description, source, url) 310 println( 311 """ 312void mozilla::intl::Locale::PerformComplexLanguageMappings() { 313 MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); 314 MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); 315""".lstrip() 316 ) 317 318 # Merge duplicate language entries. 
319 language_aliases = {} 320 for (deprecated_language, (language, script, region)) in sorted( 321 complex_language_mappings.items(), key=itemgetter(0) 322 ): 323 key = (language, script, region) 324 if key not in language_aliases: 325 language_aliases[key] = [] 326 else: 327 language_aliases[key].append(deprecated_language) 328 329 first_language = True 330 for (deprecated_language, (language, script, region)) in sorted( 331 complex_language_mappings.items(), key=itemgetter(0) 332 ): 333 key = (language, script, region) 334 if deprecated_language in language_aliases[key]: 335 continue 336 337 if_kind = "if" if first_language else "else if" 338 first_language = False 339 340 cond = ( 341 'Language().EqualTo("{}")'.format(lang) 342 for lang in [deprecated_language] + language_aliases[key] 343 ) 344 cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) 345 346 println( 347 """ 348 {} ({}) {{""".format( 349 if_kind, cond 350 ).strip( 351 "\n" 352 ) 353 ) 354 355 println( 356 """ 357 SetLanguage("{}");""".format( 358 language 359 ).strip( 360 "\n" 361 ) 362 ) 363 364 if script is not None: 365 println( 366 """ 367 if (Script().Missing()) {{ 368 SetScript("{}"); 369 }}""".format( 370 script 371 ).strip( 372 "\n" 373 ) 374 ) 375 if region is not None: 376 println( 377 """ 378 if (Region().Missing()) {{ 379 SetRegion("{}"); 380 }}""".format( 381 region 382 ).strip( 383 "\n" 384 ) 385 ) 386 println( 387 """ 388 }""".strip( 389 "\n" 390 ) 391 ) 392 393 println( 394 """ 395} 396""".strip( 397 "\n" 398 ) 399 ) 400 401 402def writeComplexRegionTagMappings( 403 println, complex_region_mappings, description, source, url 404): 405 println("") 406 writeMappingHeader(println, description, source, url) 407 println( 408 """ 409void mozilla::intl::Locale::PerformComplexRegionMappings() { 410 MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); 411 MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); 412 MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span())); 
413 MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span())); 414""".lstrip() 415 ) 416 417 # |non_default_replacements| is a list and hence not hashable. Convert it 418 # to a string to get a proper hashable value. 419 def hash_key(default, non_default_replacements): 420 return (default, str(sorted(str(v) for v in non_default_replacements))) 421 422 # Merge duplicate region entries. 423 region_aliases = {} 424 for (deprecated_region, (default, non_default_replacements)) in sorted( 425 complex_region_mappings.items(), key=itemgetter(0) 426 ): 427 key = hash_key(default, non_default_replacements) 428 if key not in region_aliases: 429 region_aliases[key] = [] 430 else: 431 region_aliases[key].append(deprecated_region) 432 433 first_region = True 434 for (deprecated_region, (default, non_default_replacements)) in sorted( 435 complex_region_mappings.items(), key=itemgetter(0) 436 ): 437 key = hash_key(default, non_default_replacements) 438 if deprecated_region in region_aliases[key]: 439 continue 440 441 if_kind = "if" if first_region else "else if" 442 first_region = False 443 444 cond = ( 445 'Region().EqualTo("{}")'.format(region) 446 for region in [deprecated_region] + region_aliases[key] 447 ) 448 cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) 449 450 println( 451 """ 452 {} ({}) {{""".format( 453 if_kind, cond 454 ).strip( 455 "\n" 456 ) 457 ) 458 459 replacement_regions = sorted( 460 {region for (_, _, region) in non_default_replacements} 461 ) 462 463 first_case = True 464 for replacement_region in replacement_regions: 465 replacement_language_script = sorted( 466 (language, script) 467 for (language, script, region) in (non_default_replacements) 468 if region == replacement_region 469 ) 470 471 if_kind = "if" if first_case else "else if" 472 first_case = False 473 474 def compare_tags(language, script): 475 if script is None: 476 return 'Language().EqualTo("{}")'.format(language) 477 return '(Language().EqualTo("{}") && 
Script().EqualTo("{}"))'.format( 478 language, script 479 ) 480 481 cond = ( 482 compare_tags(language, script) 483 for (language, script) in replacement_language_script 484 ) 485 cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond) 486 487 println( 488 """ 489 {} ({}) {{ 490 SetRegion("{}"); 491 }}""".format( 492 if_kind, cond, replacement_region 493 ) 494 .rstrip() 495 .strip("\n") 496 ) 497 498 println( 499 """ 500 else {{ 501 SetRegion("{}"); 502 }} 503 }}""".format( 504 default 505 ) 506 .rstrip() 507 .strip("\n") 508 ) 509 510 println( 511 """ 512} 513""".strip( 514 "\n" 515 ) 516 ) 517 518 519def writeVariantTagMappings(println, variant_mappings, description, source, url): 520 """Writes a function definition that maps variant subtags.""" 521 println( 522 """ 523static const char* ToCharPointer(const char* str) { 524 return str; 525} 526 527static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) { 528 return str.get(); 529} 530 531template <typename T, typename U = T> 532static bool IsLessThan(const T& a, const U& b) { 533 return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; 534} 535""" 536 ) 537 writeMappingHeader(println, description, source, url) 538 println( 539 """ 540bool mozilla::intl::Locale::PerformVariantMappings() { 541 // The variant subtags need to be sorted for binary search. 542 MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), 543 IsLessThan<decltype(mVariants)::ElementType>)); 544 545 auto removeVariantAt = [&](size_t index) { 546 mVariants.erase(mVariants.begin() + index); 547 }; 548 549 auto insertVariantSortedIfNotPresent = [&](const char* variant) { 550 auto* p = std::lower_bound( 551 mVariants.begin(), mVariants.end(), variant, 552 IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>); 553 554 // Don't insert the replacement when already present. 555 if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { 556 return true; 557 } 558 559 // Insert the preferred variant in sort order. 
560 auto preferred = DuplicateStringToUniqueChars(variant); 561 return !!mVariants.insert(p, std::move(preferred)); 562 }; 563 564 for (size_t i = 0; i < mVariants.length();) { 565 const char* variant = mVariants[i].get(); 566 MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); 567""".lstrip() 568 ) 569 570 (no_alias, with_alias) = partition( 571 variant_mappings.items(), lambda item: item[1] is None 572 ) 573 574 no_replacements = " ||\n ".join( 575 f"""strcmp(variant, "{deprecated_variant}") == 0""" 576 for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0)) 577 ) 578 579 println( 580 f""" 581 if ({no_replacements}) {{ 582 removeVariantAt(i); 583 }} 584""".strip( 585 "\n" 586 ) 587 ) 588 589 for (deprecated_variant, (type, replacement)) in sorted( 590 with_alias, key=itemgetter(0) 591 ): 592 println( 593 f""" 594 else if (strcmp(variant, "{deprecated_variant}") == 0) {{ 595 removeVariantAt(i); 596""".strip( 597 "\n" 598 ) 599 ) 600 601 if type == "language": 602 println( 603 f""" 604 SetLanguage("{replacement}"); 605""".strip( 606 "\n" 607 ) 608 ) 609 elif type == "region": 610 println( 611 f""" 612 SetRegion("{replacement}"); 613""".strip( 614 "\n" 615 ) 616 ) 617 else: 618 assert type == "variant" 619 println( 620 f""" 621 if (!insertVariantSortedIfNotPresent("{replacement}")) {{ 622 return false; 623 }} 624""".strip( 625 "\n" 626 ) 627 ) 628 629 println( 630 """ 631 } 632""".strip( 633 "\n" 634 ) 635 ) 636 637 println( 638 """ 639 else { 640 i++; 641 } 642 } 643 return true; 644} 645""".strip( 646 "\n" 647 ) 648 ) 649 650 651def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url): 652 """Writes a function definition that maps legacy language tags.""" 653 println("") 654 writeMappingHeader(println, description, source, url) 655 println( 656 """\ 657bool mozilla::intl::Locale::UpdateLegacyMappings() { 658 // We're mapping legacy tags to non-legacy form here. 659 // Other tags remain unchanged. 
660 // 661 // Legacy tags are either sign language tags ("sgn") or have one or multiple 662 // variant subtags. Therefore we can quickly exclude most tags by checking 663 // these two subtags. 664 665 MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); 666 667 if (!Language().EqualTo("sgn") && mVariants.length() == 0) { 668 return true; 669 } 670 671#ifdef DEBUG 672 for (const auto& variant : Variants()) { 673 MOZ_ASSERT(IsStructurallyValidVariantTag(variant)); 674 MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant)); 675 } 676#endif 677 678 // The variant subtags need to be sorted for binary search. 679 MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), 680 IsLessThan<decltype(mVariants)::ElementType>)); 681 682 auto findVariant = [this](const char* variant) { 683 auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, 684 IsLessThan<decltype(mVariants)::ElementType, 685 decltype(variant)>); 686 687 if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { 688 return p; 689 } 690 return static_cast<decltype(p)>(nullptr); 691 }; 692 693 auto insertVariantSortedIfNotPresent = [&](const char* variant) { 694 auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, 695 IsLessThan<decltype(mVariants)::ElementType, 696 decltype(variant)>); 697 698 // Don't insert the replacement when already present. 699 if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { 700 return true; 701 } 702 703 // Insert the preferred variant in sort order. 
704 auto preferred = DuplicateStringToUniqueChars(variant); 705 return !!mVariants.insert(p, std::move(preferred)); 706 }; 707 708 auto removeVariant = [&](auto* p) { 709 size_t index = std::distance(mVariants.begin(), p); 710 mVariants.erase(mVariants.begin() + index); 711 }; 712 713 auto removeVariants = [&](auto* p, auto* q) { 714 size_t pIndex = std::distance(mVariants.begin(), p); 715 size_t qIndex = std::distance(mVariants.begin(), q); 716 MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted"); 717 718 mVariants.erase(mVariants.begin() + qIndex); 719 mVariants.erase(mVariants.begin() + pIndex); 720 };""" 721 ) 722 723 # Helper class for pattern matching. 724 class AnyClass: 725 def __eq__(self, obj): 726 return obj is not None 727 728 Any = AnyClass() 729 730 # Group the mappings by language. 731 legacy_mappings_by_language = {} 732 for (type, replacement) in legacy_mappings.items(): 733 (language, _, _, _) = type 734 legacy_mappings_by_language.setdefault(language, {})[type] = replacement 735 736 # Handle the empty language case first. 737 if None in legacy_mappings_by_language: 738 # Get the mappings and remove them from the dict. 739 mappings = legacy_mappings_by_language.pop(None) 740 741 # This case only applies for the "hepburn-heploc" -> "alalc97" 742 # mapping, so just inline it here. 743 from_tag = (None, None, None, "hepburn-heploc") 744 to_tag = (None, None, None, "alalc97") 745 746 assert len(mappings) == 1 747 assert mappings[from_tag] == to_tag 748 749 println( 750 """ 751 if (mVariants.length() >= 2) { 752 if (auto* hepburn = findVariant("hepburn")) { 753 if (auto* heploc = findVariant("heploc")) { 754 removeVariants(hepburn, heploc); 755 756 if (!insertVariantSortedIfNotPresent("alalc97")) { 757 return false; 758 } 759 } 760 } 761 } 762""" 763 ) 764 765 # Handle sign languages next. 
766 if "sgn" in legacy_mappings_by_language: 767 mappings = legacy_mappings_by_language.pop("sgn") 768 769 # Legacy sign language mappings have the form "sgn-XX" where "XX" is 770 # some region code. 771 assert all(type == ("sgn", None, Any, None) for type in mappings.keys()) 772 773 # Legacy sign languages are mapped to a single language subtag. 774 assert all( 775 replacement == (Any, None, None, None) for replacement in mappings.values() 776 ) 777 778 println( 779 """ 780 if (Language().EqualTo("sgn")) { 781 if (Region().Present() && SignLanguageMapping(mLanguage, Region())) { 782 mRegion.Set(mozilla::MakeStringSpan("")); 783 } 784 } 785""".rstrip().lstrip( 786 "\n" 787 ) 788 ) 789 790 # Finally handle all remaining cases. 791 792 # The remaining mappings have neither script nor region subtags in the source locale. 793 assert all( 794 type == (Any, None, None, Any) 795 for mappings in legacy_mappings_by_language.values() 796 for type in mappings.keys() 797 ) 798 799 # And they have neither script nor region nor variant subtags in the target locale. 800 assert all( 801 replacement == (Any, None, None, None) 802 for mappings in legacy_mappings_by_language.values() 803 for replacement in mappings.values() 804 ) 805 806 # Compact the mappings table by removing empty fields. 807 legacy_mappings_by_language = { 808 lang: { 809 variants: r_language 810 for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items() 811 } 812 for (lang, mappings) in legacy_mappings_by_language.items() 813 } 814 815 # Try to combine the remaining cases. 816 legacy_mappings_compact = {} 817 818 # Python can't hash dicts or lists, so use the string representation as the hash key. 
819 def hash_key(mappings): 820 return str(sorted(mappings.items(), key=itemgetter(0))) 821 822 for (lang, mappings) in sorted( 823 legacy_mappings_by_language.items(), key=itemgetter(0) 824 ): 825 key = hash_key(mappings) 826 legacy_mappings_compact.setdefault(key, []).append(lang) 827 828 for langs in legacy_mappings_compact.values(): 829 language_equal_to = ( 830 f"""Language().EqualTo("{lang}")""" for lang in sorted(langs) 831 ) 832 cond = f""" ||\n{" " * len(" else if (")}""".join(language_equal_to) 833 834 println( 835 f""" 836 else if ({cond}) {{ 837""".rstrip().lstrip( 838 "\n" 839 ) 840 ) 841 842 mappings = legacy_mappings_by_language[langs[0]] 843 844 # Count the variant subtags to determine the sort order. 845 def variant_size(m): 846 (k, _) = m 847 return len(k.split("-")) 848 849 # Alias rules are applied by largest union size first. 850 for (size, mappings_by_size) in groupby( 851 sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size 852 ): 853 854 # Convert grouper object to dict. 
855 mappings_by_size = dict(mappings_by_size) 856 857 is_first = True 858 chain_if = size == 1 859 860 # Alias rules are applied in alphabetical order 861 for (variants, r_language) in sorted( 862 mappings_by_size.items(), key=itemgetter(0) 863 ): 864 sorted_variants = sorted(variants.split("-")) 865 len_variants = len(sorted_variants) 866 867 maybe_else = "else " if chain_if and not is_first else "" 868 is_first = False 869 870 for (i, variant) in enumerate(sorted_variants): 871 println( 872 f""" 873 {" " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{ 874""".rstrip().lstrip( 875 "\n" 876 ) 877 ) 878 879 indent = " " * len_variants 880 881 println( 882 f""" 883 {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)}); 884 {indent}SetLanguage("{r_language}"); 885 {indent}{"return true;" if not chain_if else ""} 886""".rstrip().lstrip( 887 "\n" 888 ) 889 ) 890 891 for i in range(len_variants, 0, -1): 892 println( 893 f""" 894 {" " * (i - 1)}}} 895""".rstrip().lstrip( 896 "\n" 897 ) 898 ) 899 900 println( 901 """ 902 } 903""".rstrip().lstrip( 904 "\n" 905 ) 906 ) 907 908 println( 909 """ 910 return true; 911}""" 912 ) 913 914 915def writeSignLanguageMappingsFunction( 916 println, legacy_mappings, description, source, url 917): 918 """Writes a function definition that maps legacy sign language tags.""" 919 println("") 920 writeMappingHeader(println, description, source, url) 921 println( 922 """\ 923bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language, 924 const RegionSubtag& region) { 925 MOZ_ASSERT(language.EqualTo("sgn")); 926 MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span())); 927 MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span())); 928""".rstrip() 929 ) 930 931 region_mappings = { 932 rg: lg 933 for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items() 934 if lang == "sgn" 935 } 936 937 source_name = "region" 938 target_name = "language" 939 tag_maxlength = 3 940 
writeMappingsBinarySearchBody( 941 println, source_name, target_name, region_mappings, tag_maxlength 942 ) 943 944 println( 945 """ 946}""".lstrip() 947 ) 948 949 950def readSupplementalData(core_file): 951 """Reads CLDR Supplemental Data and extracts information for Intl.js. 952 953 Information extracted: 954 - legacyMappings: mappings from legacy tags to preferred complete language tags 955 - languageMappings: mappings from language subtags to preferred subtags 956 - complexLanguageMappings: mappings from language subtags with complex rules 957 - regionMappings: mappings from region subtags to preferred subtags 958 - complexRegionMappings: mappings from region subtags with complex rules 959 - variantMappings: mappings from variant subtags to preferred subtags 960 - likelySubtags: likely subtags used for generating test data only 961 Returns these mappings as dictionaries. 962 """ 963 import xml.etree.ElementTree as ET 964 965 # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. 966 re_unicode_language_id = re.compile( 967 r""" 968 ^ 969 # unicode_language_id = unicode_language_subtag 970 # unicode_language_subtag = alpha{2,3} | alpha{5,8} 971 (?P<language>[a-z]{2,3}|[a-z]{5,8}) 972 973 # (sep unicode_script_subtag)? 974 # unicode_script_subtag = alpha{4} 975 (?:-(?P<script>[a-z]{4}))? 976 977 # (sep unicode_region_subtag)? 978 # unicode_region_subtag = (alpha{2} | digit{3}) 979 (?:-(?P<region>([a-z]{2}|[0-9]{3})))? 980 981 # (sep unicode_variant_subtag)* 982 # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) 983 (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? 984 $ 985 """, 986 re.IGNORECASE | re.VERBOSE, 987 ) 988 989 # CLDR uses "_" as the separator for some elements. Replace it with "-". 990 def bcp47_id(cldr_id): 991 return cldr_id.replace("_", "-") 992 993 # Return the tuple (language, script, region, variants) and assert all 994 # subtags are in canonical case. 
995 def bcp47_canonical(language, script, region, variants): 996 # Canonical case for language subtags is lower case. 997 assert language is None or language.lower() == language 998 999 # Canonical case for script subtags is title case. 1000 assert script is None or script.title() == script 1001 1002 # Canonical case for region subtags is upper case. 1003 assert region is None or region.upper() == region 1004 1005 # Canonical case for variant subtags is lower case. 1006 assert variants is None or variants.lower() == variants 1007 1008 return (language, script, region, variants[1:] if variants else None) 1009 1010 # Language ids are interpreted as multi-maps in 1011 # <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>. 1012 # 1013 # See UTS35, §Annex C, Definitions - 1. Multimap interpretation. 1014 def language_id_to_multimap(language_id): 1015 match = re_unicode_language_id.match(language_id) 1016 assert ( 1017 match is not None 1018 ), f"{language_id} invalid Unicode BCP 47 locale identifier" 1019 1020 canonical_language_id = bcp47_canonical( 1021 *match.group("language", "script", "region", "variants") 1022 ) 1023 (language, _, _, _) = canonical_language_id 1024 1025 # Normalize "und" language to None, but keep the rest as is. 1026 return (language if language != "und" else None,) + canonical_language_id[1:] 1027 1028 rules = {} 1029 territory_exception_rules = {} 1030 1031 tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml")) 1032 1033 # Load the rules from supplementalMetadata.xml. 1034 # 1035 # See UTS35, §Annex C, Definitions - 2. Alias elements. 1036 # See UTS35, §Annex C, Preprocessing. 1037 for alias_name in [ 1038 "languageAlias", 1039 "scriptAlias", 1040 "territoryAlias", 1041 "variantAlias", 1042 ]: 1043 for alias in tree.iterfind(".//" + alias_name): 1044 # Replace '_' by '-'. 1045 type = bcp47_id(alias.get("type")) 1046 replacement = bcp47_id(alias.get("replacement")) 1047 1048 # Prefix with "und-". 
1049 if alias_name != "languageAlias": 1050 type = "und-" + type 1051 1052 # Discard all rules where the type is an invalid languageId. 1053 if re_unicode_language_id.match(type) is None: 1054 continue 1055 1056 type = language_id_to_multimap(type) 1057 1058 # Multiple, whitespace-separated territory replacements may be present. 1059 if alias_name == "territoryAlias" and " " in replacement: 1060 replacements = replacement.split(" ") 1061 replacement_list = [ 1062 language_id_to_multimap("und-" + r) for r in replacements 1063 ] 1064 1065 assert ( 1066 type not in territory_exception_rules 1067 ), f"Duplicate alias rule: {type}" 1068 1069 territory_exception_rules[type] = replacement_list 1070 1071 # The first element is the default territory replacement. 1072 replacement = replacements[0] 1073 1074 # Prefix with "und-". 1075 if alias_name != "languageAlias": 1076 replacement = "und-" + replacement 1077 1078 replacement = language_id_to_multimap(replacement) 1079 1080 assert type not in rules, f"Duplicate alias rule: {type}" 1081 1082 rules[type] = replacement 1083 1084 # Helper class for pattern matching. 1085 class AnyClass: 1086 def __eq__(self, obj): 1087 return obj is not None 1088 1089 Any = AnyClass() 1090 1091 modified_rules = True 1092 loop_count = 0 1093 1094 while modified_rules: 1095 modified_rules = False 1096 loop_count += 1 1097 1098 # UTS 35 defines that canonicalization is applied until a fixed point has 1099 # been reached. This iterative application of the canonicalization algorithm 1100 # is only needed for a relatively small set of rules, so we can precompute 1101 # the transitive closure of all rules here and then perform a single pass 1102 # when canonicalizing language tags at runtime. 1103 transitive_rules = {} 1104 1105 # Compute the transitive closure. 1106 # Any case which currently doesn't occur in the CLDR sources isn't supported 1107 # and will lead to throwing an error. 
1108 for (type, replacement) in rules.items(): 1109 (language, script, region, variants) = type 1110 (r_language, r_script, r_region, r_variants) = replacement 1111 1112 for (i_type, i_replacement) in rules.items(): 1113 (i_language, i_script, i_region, i_variants) = i_type 1114 (i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement 1115 1116 if i_language is not None and i_language == r_language: 1117 # This case currently only occurs when neither script nor region 1118 # subtags are present. A single variant subtags may be present 1119 # in |type|. And |i_type| definitely has a single variant subtag. 1120 # Should this ever change, update this code accordingly. 1121 assert type == (Any, None, None, None) or type == ( 1122 Any, 1123 None, 1124 None, 1125 Any, 1126 ) 1127 assert replacement == (Any, None, None, None) 1128 assert i_type == (Any, None, None, Any) 1129 assert i_replacement == (Any, None, None, None) 1130 1131 # This case happens for the rules 1132 # "zh-guoyu -> zh", 1133 # "zh-hakka -> hak", and 1134 # "und-hakka -> und". 1135 # Given the possible input "zh-guoyu-hakka", the first rule will 1136 # change it to "zh-hakka", and then the second rule can be 1137 # applied. (The third rule isn't applied ever.) 1138 # 1139 # Let's assume there's a hypothetical rule 1140 # "zh-aaaaa" -> "en" 1141 # And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en" 1142 # is applied before "zh-hakka -> hak", because rules are sorted 1143 # alphabetically. That means the overall result is "en": 1144 # "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then 1145 # "hakka" is removed through the third rule. 1146 # 1147 # No current rule requires to handle this special case, so we 1148 # don't yet support it. 1149 assert variants is None or variants <= i_variants 1150 1151 # Combine all variants and remove duplicates. 
1152 vars = set( 1153 i_variants.split("-") 1154 + (variants.split("-") if variants else []) 1155 ) 1156 1157 # Add the variants alphabetically sorted. 1158 n_type = (language, None, None, "-".join(sorted(vars))) 1159 1160 assert ( 1161 n_type not in transitive_rules 1162 or transitive_rules[n_type] == i_replacement 1163 ) 1164 transitive_rules[n_type] = i_replacement 1165 1166 continue 1167 1168 if i_script is not None and i_script == r_script: 1169 # This case currently doesn't occur, so we don't yet support it. 1170 raise ValueError( 1171 f"{type} -> {replacement} :: {i_type} -> {i_replacement}" 1172 ) 1173 if i_region is not None and i_region == r_region: 1174 # This case currently only applies for sign language 1175 # replacements. Similar to the language subtag case any other 1176 # combination isn't currently supported. 1177 assert type == (None, None, Any, None) 1178 assert replacement == (None, None, Any, None) 1179 assert i_type == ("sgn", None, Any, None) 1180 assert i_replacement == (Any, None, None, None) 1181 1182 n_type = ("sgn", None, region, None) 1183 1184 assert n_type not in transitive_rules 1185 transitive_rules[n_type] = i_replacement 1186 1187 continue 1188 1189 if i_variants is not None and i_variants == r_variants: 1190 # This case currently doesn't occur, so we don't yet support it. 1191 raise ValueError( 1192 f"{type} -> {replacement} :: {i_type} -> {i_replacement}" 1193 ) 1194 1195 # Ensure there are no contradicting rules. 1196 assert all( 1197 rules[type] == replacement 1198 for (type, replacement) in transitive_rules.items() 1199 if type in rules 1200 ) 1201 1202 # If |transitive_rules| is not a subset of |rules|, new rules will be added. 1203 modified_rules = not (transitive_rules.keys() <= rules.keys()) 1204 1205 # Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}" 1206 # case. 
Failing this assertion means either there's a bug when computing the 1207 # stop condition of this loop or a new kind of legacy language tags was added. 1208 if modified_rules and loop_count > 1: 1209 new_rules = {k for k in transitive_rules.keys() if k not in rules} 1210 for k in new_rules: 1211 assert k == (Any, None, None, "guoyu-hakka") or k == ( 1212 Any, 1213 None, 1214 None, 1215 "guoyu-xiang", 1216 ) 1217 1218 # Merge the transitive rules. 1219 rules.update(transitive_rules) 1220 1221 # Computes the size of the union of all field value sets. 1222 def multi_map_size(locale_id): 1223 (language, script, region, variants) = locale_id 1224 1225 return ( 1226 (1 if language is not None else 0) 1227 + (1 if script is not None else 0) 1228 + (1 if region is not None else 0) 1229 + (len(variants.split("-")) if variants is not None else 0) 1230 ) 1231 1232 # Dictionary of legacy mappings, contains raw rules, e.g. 1233 # (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97"). 1234 legacy_mappings = {} 1235 1236 # Dictionary of simple language subtag mappings, e.g. "in" -> "id". 1237 language_mappings = {} 1238 1239 # Dictionary of complex language subtag mappings, modifying more than one 1240 # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME"). 1241 complex_language_mappings = {} 1242 1243 # Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh". 1244 script_mappings = {} 1245 1246 # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE". 1247 region_mappings = {} 1248 1249 # Dictionary of complex region subtag mappings, containing more than one 1250 # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]). 1251 complex_region_mappings = {} 1252 1253 # Dictionary of aliased variant subtags to a tuple of preferred replacement 1254 # type and replacement, e.g. "arevela" -> ("language", "hy") or 1255 # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97"). 
1256 variant_mappings = {} 1257 1258 # Preprocess all rules so we can perform a single lookup per subtag at runtime. 1259 for (type, replacement) in rules.items(): 1260 (language, script, region, variants) = type 1261 (r_language, r_script, r_region, r_variants) = replacement 1262 1263 type_map_size = multi_map_size(type) 1264 1265 # Most mappings are one-to-one and can be encoded through lookup tables. 1266 if type_map_size == 1: 1267 if language is not None: 1268 assert r_language is not None, "Can't remove a language subtag" 1269 1270 # We don't yet support this case. 1271 assert ( 1272 r_variants is None 1273 ), f"Unhandled variant replacement in language alias: {replacement}" 1274 1275 if replacement == (Any, None, None, None): 1276 language_mappings[language] = r_language 1277 else: 1278 complex_language_mappings[language] = replacement[:-1] 1279 elif script is not None: 1280 # We don't support removing script subtags. 1281 assert ( 1282 r_script is not None 1283 ), f"Can't remove a script subtag: {replacement}" 1284 1285 # We only support one-to-one script mappings for now. 1286 assert replacement == ( 1287 None, 1288 Any, 1289 None, 1290 None, 1291 ), f"Unhandled replacement in script alias: {replacement}" 1292 1293 script_mappings[script] = r_script 1294 elif region is not None: 1295 # We don't support removing region subtags. 1296 assert ( 1297 r_region is not None 1298 ), f"Can't remove a region subtag: {replacement}" 1299 1300 # We only support one-to-one region mappings for now. 
1301 assert replacement == ( 1302 None, 1303 None, 1304 Any, 1305 None, 1306 ), f"Unhandled replacement in region alias: {replacement}" 1307 1308 if type not in territory_exception_rules: 1309 region_mappings[region] = r_region 1310 else: 1311 complex_region_mappings[region] = [ 1312 r_region 1313 for (_, _, r_region, _) in territory_exception_rules[type] 1314 ] 1315 else: 1316 assert variants is not None 1317 assert len(variants.split("-")) == 1 1318 1319 # We only support one-to-one variant mappings for now. 1320 assert ( 1321 multi_map_size(replacement) <= 1 1322 ), f"Unhandled replacement in variant alias: {replacement}" 1323 1324 if r_language is not None: 1325 variant_mappings[variants] = ("language", r_language) 1326 elif r_script is not None: 1327 variant_mappings[variants] = ("script", r_script) 1328 elif r_region is not None: 1329 variant_mappings[variants] = ("region", r_region) 1330 elif r_variants is not None: 1331 assert len(r_variants.split("-")) == 1 1332 variant_mappings[variants] = ("variant", r_variants) 1333 else: 1334 variant_mappings[variants] = None 1335 else: 1336 # Alias rules which have multiple input fields must be processed 1337 # first. This applies only to a handful of rules, so our generated 1338 # code adds fast paths to skip these rules in the common case. 1339 1340 # Case 1: Language and at least one variant subtag. 1341 if language is not None and variants is not None: 1342 pass 1343 1344 # Case 2: Sign language and a region subtag. 1345 elif language == "sgn" and region is not None: 1346 pass 1347 1348 # Case 3: "hepburn-heploc" to "alalc97" canonicalization. 1349 elif ( 1350 language is None 1351 and variants is not None 1352 and len(variants.split("-")) == 2 1353 ): 1354 pass 1355 1356 # Any other combination is currently unsupported. 
1357 else: 1358 raise ValueError(f"{type} -> {replacement}") 1359 1360 legacy_mappings[type] = replacement 1361 1362 tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) 1363 1364 likely_subtags = {} 1365 1366 for likely_subtag in tree.iterfind(".//likelySubtag"): 1367 from_tag = bcp47_id(likely_subtag.get("from")) 1368 from_match = re_unicode_language_id.match(from_tag) 1369 assert ( 1370 from_match is not None 1371 ), f"{from_tag} invalid Unicode BCP 47 locale identifier" 1372 assert ( 1373 from_match.group("variants") is None 1374 ), f"unexpected variant subtags in {from_tag}" 1375 1376 to_tag = bcp47_id(likely_subtag.get("to")) 1377 to_match = re_unicode_language_id.match(to_tag) 1378 assert ( 1379 to_match is not None 1380 ), f"{to_tag} invalid Unicode BCP 47 locale identifier" 1381 assert ( 1382 to_match.group("variants") is None 1383 ), f"unexpected variant subtags in {to_tag}" 1384 1385 from_canonical = bcp47_canonical( 1386 *from_match.group("language", "script", "region", "variants") 1387 ) 1388 1389 to_canonical = bcp47_canonical( 1390 *to_match.group("language", "script", "region", "variants") 1391 ) 1392 1393 # Remove the empty variant subtags. 1394 from_canonical = from_canonical[:-1] 1395 to_canonical = to_canonical[:-1] 1396 1397 likely_subtags[from_canonical] = to_canonical 1398 1399 complex_region_mappings_final = {} 1400 1401 for (deprecated_region, replacements) in complex_region_mappings.items(): 1402 # Find all likely subtag entries which don't already contain a region 1403 # subtag and whose target region is in the list of replacement regions. 1404 region_likely_subtags = [ 1405 (from_language, from_script, to_region) 1406 for ( 1407 (from_language, from_script, from_region), 1408 (_, _, to_region), 1409 ) in likely_subtags.items() 1410 if from_region is None and to_region in replacements 1411 ] 1412 1413 # The first replacement entry is the default region. 
def readUnicodeExtensions(core_file):
    """Read deprecated-to-preferred Unicode extension value mappings from CLDR.

    |core_file| is an open ZipFile of the CLDR "core.zip" archive.

    Returns a dict with two entries:
      "unicodeMappings"   - mappings for the BCP 47 "u" extension,
      "transformMappings" - mappings for the BCP 47 "t" extension,
    each a dict keyed by extension key (e.g. "ca"), whose values map a
    deprecated extension value to its preferred replacement.
    """
    import xml.etree.ElementTree as ET

    # Match all xml-files in the BCP 47 directory.
    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")

    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
    #
    # type = alphanum{3,8} (sep alphanum{3,8})* ;
    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")

    # https://www.unicode.org/reports/tr35/#Unicode_language_identifier
    #
    # unicode_region_subtag = alpha{2} ;
    alphaRegionRE = re.compile(r"^[A-Z]{2}$", re.IGNORECASE)

    # Mapping from Unicode extension types to dict of deprecated to
    # preferred values.
    mapping = {
        # Unicode BCP 47 U Extension
        "u": {},
        # Unicode BCP 47 T Extension
        "t": {},
    }

    def readBCP47File(file):
        # Collect the alias entries of a single common/bcp47/*.xml file
        # into |mapping|.
        tree = ET.parse(file)
        for keyword in tree.iterfind(".//keyword/key"):
            # The 'extension' attribute defaults to "u" when absent.
            extension = keyword.get("extension", "u")
            assert (
                extension == "u" or extension == "t"
            ), "unknown extension type: {}".format(extension)

            extension_name = keyword.get("name")

            for type in keyword.iterfind("type"):
                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The key or type name used by Unicode locale extension with 'u' extension
                # syntax or the 't' extensions syntax. When alias below is absent, this name
                # can be also used with the old style "@key=type" syntax.
                name = type.get("name")

                # Ignore the special name:
                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
                # - <https://unicode.org/reports/tr35/#SCRIPT_CODE>
                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
                if name in (
                    "CODEPOINTS",
                    "REORDER_CODE",
                    "RG_KEY_VALUE",
                    "SCRIPT_CODE",
                    "SUBDIVISION_CODE",
                    "PRIVATE_USE",
                ):
                    continue

                # All other names should match the 'type' production.
                assert (
                    typeRE.match(name) is not None
                ), "{} matches the 'type' production".format(name)

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The preferred value of the deprecated key, type or attribute element.
                # When a key, type or attribute element is deprecated, this attribute is
                # used for specifying a new canonical form if available.
                preferred = type.get("preferred")

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The BCP 47 form is the canonical form, and recommended. Other aliases are
                # included only for backwards compatibility.
                alias = type.get("alias")

                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
                #
                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
                # value, while the canonical is in the name attribute value.

                # 'preferred' contains the new preferred name, 'alias' the compatibility
                # name, but then there's this entry where 'preferred' and 'alias' are the
                # same. So which one to choose? Assume 'preferred' is the actual canonical
                # name.
                #
                # <type name="islamicc"
                #       description="Civil (algorithmic) Arabic calendar"
                #       deprecated="true"
                #       preferred="islamic-civil"
                #       alias="islamic-civil"/>

                if preferred is not None:
                    assert typeRE.match(preferred), preferred
                    mapping[extension].setdefault(extension_name, {})[name] = preferred

                if alias is not None:
                    for alias_name in alias.lower().split(" "):
                        # Ignore alias entries which don't match the 'type' production.
                        if typeRE.match(alias_name) is None:
                            continue

                        # See comment above when 'alias' and 'preferred' are both present.
                        if (
                            preferred is not None
                            and name in mapping[extension][extension_name]
                        ):
                            continue

                        # Skip over entries where 'name' and 'alias' are equal.
                        #
                        # <type name="pst8pdt"
                        #       description="POSIX style time zone for US Pacific Time"
                        #       alias="PST8PDT"
                        #       since="1.8"/>
                        if name == alias_name:
                            continue

                        mapping[extension].setdefault(extension_name, {})[
                            alias_name
                        ] = name

    def readSupplementalMetadata(file):
        # Find subdivision and region replacements.
        #
        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
        #
        # Replace aliases in special key values:
        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
        #     in its value in the same way, using subdivisionAlias data.
        tree = ET.parse(file)
        for alias in tree.iterfind(".//subdivisionAlias"):
            type = alias.get("type")
            assert (
                typeRE.match(type) is not None
            ), "{} matches the 'type' production".format(type)

            # Take the first replacement when multiple ones are present.
            replacement = alias.get("replacement").split(" ")[0].lower()

            # Append "zzzz" if the replacement is a two-letter region code.
            if alphaRegionRE.match(replacement) is not None:
                replacement += "zzzz"

            # Assert the replacement is syntactically correct.
            assert (
                typeRE.match(replacement) is not None
            ), "replacement {} matches the 'type' production".format(replacement)

            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
            mapping["u"].setdefault("rg", {})[type] = replacement
            mapping["u"].setdefault("sd", {})[type] = replacement

    # Process every BCP 47 data file in the archive.
    for name in core_file.namelist():
        if bcpFileRE.match(name):
            readBCP47File(core_file.open(name))

    readSupplementalMetadata(
        core_file.open("common/supplemental/supplementalMetadata.xml")
    )

    return {
        "unicodeMappings": mapping["u"],
        "transformMappings": mapping["t"],
    }
def writeCLDRLanguageTagData(println, data, url):
    """Writes the language tag data to the Intl data file.

    |println| is the output callback, |data| the dict assembled by the
    readSupplementalData/readUnicodeExtensions readers (version, mapping
    tables), and |url| the CLDR download URL recorded in the header.

    Emits the C++ preamble (helper templates and DEBUG-only case
    validators) followed by one generated table or function per mapping
    kind.
    """

    println(generatedFileWarning)
    println("// Version: CLDR-{}".format(data["version"]))
    println("// URL: {}".format(url))

    # C++ preamble: binary-search helpers over the fixed-width subtag
    # tables, plus DEBUG-only canonical-case validators.
    println(
        """
#include "mozilla/Assertions.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <string>
#include <type_traits>

#include "mozilla/intl/Locale.h"

using namespace mozilla::intl::LanguageTagLimits;

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
    const char (&subtags)[Length][TagLength],
    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.Length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.Span().data();
  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
}

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
    const char (&subtags)[Length][TagLength], const char* (&aliases)[Length],
    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.Length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.Span().data();
  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
    return aliases[std::distance(std::begin(subtags), p)];
  }
  return nullptr;
}

#ifdef DEBUG
static bool IsAsciiLowercaseAlphanumeric(char c) {
  return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
}

static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
  return IsAsciiLowercaseAlphanumeric(c) || c == '-';
}

static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(),
                     mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
  return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
         std::all_of(span.begin() + 1, span.end(),
                     mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(),
                     mozilla::IsAsciiUppercaseAlpha<char>) ||
         std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
}

static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
  return std::all_of(type.begin(), type.end(),
                     IsAsciiLowercaseAlphanumericOrDash);
}

static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
  return std::all_of(type.begin(), type.end(),
                     IsAsciiLowercaseAlphanumericOrDash);
}
#endif
""".rstrip()
    )

    source = "CLDR Supplemental Data, version {}".format(data["version"])
    legacy_mappings = data["legacyMappings"]
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    variant_mappings = data["variantMappings"]
    unicode_mappings = data["unicodeMappings"]
    transform_mappings = data["transformMappings"]

    # Maximum subtag lengths from the TR35 grammar, used to size the
    # fixed-width C++ tables.

    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    language_maxlength = 8

    # unicode_script_subtag = alpha{4} ;
    script_maxlength = 4

    # unicode_region_subtag = (alpha{2} | digit{3}) ;
    region_maxlength = 3

    writeMappingsBinarySearch(
        println,
        "LanguageMapping",
        "LanguageSubtag&",
        "language",
        "IsStructurallyValidLanguageTag",
        "IsCanonicallyCasedLanguageTag",
        language_mappings,
        language_maxlength,
        "Mappings from language subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ComplexLanguageMapping",
        "const LanguageSubtag&",
        "language",
        "IsStructurallyValidLanguageTag",
        "IsCanonicallyCasedLanguageTag",
        complex_language_mappings.keys(),
        language_maxlength,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ScriptMapping",
        "ScriptSubtag&",
        "script",
        "IsStructurallyValidScriptTag",
        "IsCanonicallyCasedScriptTag",
        script_mappings,
        script_maxlength,
        "Mappings from script subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "RegionMapping",
        "RegionSubtag&",
        "region",
        "IsStructurallyValidRegionTag",
        "IsCanonicallyCasedRegionTag",
        region_mappings,
        region_maxlength,
        "Mappings from region subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ComplexRegionMapping",
        "const RegionSubtag&",
        "region",
        "IsStructurallyValidRegionTag",
        "IsCanonicallyCasedRegionTag",
        complex_region_mappings.keys(),
        region_maxlength,
        "Region subtags with complex mappings.",
        source,
        url,
    )

    writeComplexLanguageTagMappings(
        println,
        complex_language_mappings,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeComplexRegionTagMappings(
        println,
        complex_region_mappings,
        "Region subtags with complex mappings.",
        source,
        url,
    )

    writeVariantTagMappings(
        println,
        variant_mappings,
        "Mappings from variant subtags to preferred values.",
        source,
        url,
    )

    writeLegacyMappingsFunction(
        println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url
    )

    writeSignLanguageMappingsFunction(
        println, legacy_mappings, "Mappings from legacy sign languages.", source, url
    )

    writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
    writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
    """Writes the likely-subtags test file.

    |println| is the output callback, |data| the dict of mapping tables
    read from CLDR, and |url| the CLDR download URL recorded in the
    header.  Emits JS test data and assertions for
    Intl.Locale.prototype.maximize()/minimize().
    """

    println(generatedFileWarning)

    source = "CLDR Supplemental Data, version {}".format(data["version"])
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    likely_subtags = data["likelySubtags"]

    def bcp47(tag):
        # Format a (language, script, region) tuple as a BCP 47 tag string.
        (language, script, region) = tag
        return "{}{}{}".format(
            language, "-" + script if script else "", "-" + region if region else ""
        )

    def canonical(tag):
        # Canonicalize a (language, script, region) tuple using the
        # simple and complex mapping tables.
        (language, script, region) = tag

        # Map deprecated language subtags.
        if language in language_mappings:
            language = language_mappings[language]
        elif language in complex_language_mappings:
            # A complex mapping only fills in script/region subtags which
            # aren't already present in the input.
            (language2, script2, region2) = complex_language_mappings[language]
            (language, script, region) = (
                language2,
                script if script else script2,
                region if region else region2,
            )

        # Map deprecated script subtags.
        if script in script_mappings:
            script = script_mappings[script]

        # Map deprecated region subtags.
        if region in region_mappings:
            region = region_mappings[region]
        else:
            # Assume no complex region mappings are needed for now.
            assert (
                region not in complex_region_mappings
            ), "unexpected region with complex mappings: {}".format(region)

        return (language, script, region)

    # https://unicode.org/reports/tr35/#Likely_Subtags
    def addLikelySubtags(tag):
        # Step 1: Canonicalize.
        (language, script, region) = canonical(tag)
        if script == "Zzzz":
            script = None
        if region == "ZZ":
            region = None

        # Step 2: Lookup.
        searches = (
            (language, script, region),
            (language, None, region),
            (language, script, None),
            (language, None, None),
            ("und", script, None),
        )
        search = next(search for search in searches if search in likely_subtags)

        (language_s, script_s, region_s) = search
        (language_m, script_m, region_m) = likely_subtags[search]

        # Step 3: Return. Keep explicitly given subtags, fill the rest
        # from the matched likely-subtags entry.
        return (
            language if language != language_s else language_m,
            script if script != script_s else script_m,
            region if region != region_s else region_m,
        )

    # https://unicode.org/reports/tr35/#Likely_Subtags
    def removeLikelySubtags(tag):
        # Step 1: Add likely subtags.
        max = addLikelySubtags(tag)

        # Step 2: Remove variants (doesn't apply here).

        # Step 3: Find a match.
        (language, script, region) = max
        for trial in (
            (language, None, None),
            (language, None, region),
            (language, script, None),
        ):
            if addLikelySubtags(trial) == max:
                return trial

        # Step 4: Return maximized if no match found.
        return max

    def likely_canonical(from_tag, to_tag):
        # Canonicalize the input tag.
        from_tag = canonical(from_tag)

        # Update the expected result if necessary.
        if from_tag in likely_subtags:
            to_tag = likely_subtags[from_tag]

        # Canonicalize the expected output.
        to_canonical = canonical(to_tag)

        # Sanity check: This should match the result of |addLikelySubtags|.
        assert to_canonical == addLikelySubtags(from_tag)

        return to_canonical

    # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
    likely_subtags_canonical = {
        k: likely_canonical(k, v) for (k, v) in likely_subtags.items()
    }

    # Add test data for |Intl.Locale.prototype.maximize()|.
    writeMappingsVar(
        println,
        {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
        "maxLikelySubtags",
        "Extracted from likelySubtags.xml.",
        source,
        url,
    )

    # Use the maximalized tags as the input for the remove likely-subtags test.
    minimized = {
        tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()
    }

    # Add test data for |Intl.Locale.prototype.minimize()|.
    writeMappingsVar(
        println,
        {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
        "minLikelySubtags",
        "Extracted from likelySubtags.xml.",
        source,
        url,
    )

    println(
        """
for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
  assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
}"""
    )

    println(
        """
for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
  assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
}"""
    )

    println(
        """
if (typeof reportCompare === "function")
  reportCompare(0, 0);"""
    )
def readCLDRVersionFromICU():
    """Extract the CLDR version from the in-tree ICU sources.

    Scans intl/icu/source/data/misc/supplementalData.txt for the
    cldrVersion entry and returns its version string (e.g. "39" or
    "39.1").

    Raises RuntimeError if the ICU source directory is missing or no
    cldrVersion entry could be found.
    """
    icuDir = os.path.join(topsrcdir, "intl/icu/source")
    if not os.path.isdir(icuDir):
        raise RuntimeError("not a directory: {}".format(icuDir))

    reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}')

    # Fix: initialize before the loop. Previously |version| was only bound
    # inside the loop, so a data file without any cldrVersion entry raised
    # NameError at the |version is None| check instead of the intended
    # RuntimeError.
    version = None

    for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")):
        m = reVersion.match(line)
        if m:
            version = m.group(1)
            break

    if version is None:
        raise RuntimeError("can't resolve CLDR version")

    return version
def flines(filepath, encoding="utf-8"):
    """Yield the lines of the text file at |filepath|, decoded with |encoding|."""
    with io.open(filepath, mode="r", encoding=encoding) as fileobj:
        yield from fileobj
def __init__(self, name, filename=""): 2109 self.name = name 2110 self.filename = filename 2111 2112 def __eq__(self, other): 2113 return hasattr(other, "name") and self.name == other.name 2114 2115 def __lt__(self, other): 2116 return self.name < other.name 2117 2118 def __hash__(self): 2119 return hash(self.name) 2120 2121 def __str__(self): 2122 return self.name 2123 2124 def __repr__(self): 2125 return self.name 2126 2127 2128class TzDataDir(object): 2129 """tzdata source from a directory.""" 2130 2131 def __init__(self, obj): 2132 self.name = partial(os.path.basename, obj) 2133 self.resolve = partial(os.path.join, obj) 2134 self.basename = os.path.basename 2135 self.isfile = os.path.isfile 2136 self.listdir = partial(os.listdir, obj) 2137 self.readlines = flines 2138 2139 2140class TzDataFile(object): 2141 """tzdata source from a file (tar or gzipped).""" 2142 2143 def __init__(self, obj): 2144 self.name = lambda: os.path.splitext( 2145 os.path.splitext(os.path.basename(obj))[0] 2146 )[0] 2147 self.resolve = obj.getmember 2148 self.basename = attrgetter("name") 2149 self.isfile = tarfile.TarInfo.isfile 2150 self.listdir = obj.getnames 2151 self.readlines = partial(self._tarlines, obj) 2152 2153 def _tarlines(self, tar, m): 2154 with closing(tar.extractfile(m)) as f: 2155 for line in f: 2156 yield line.decode("utf-8") 2157 2158 2159def validateTimeZones(zones, links): 2160 """Validate the zone and link entries.""" 2161 linkZones = set(links.keys()) 2162 intersect = linkZones.intersection(zones) 2163 if intersect: 2164 raise RuntimeError("Links also present in zones: %s" % intersect) 2165 2166 zoneNames = {z.name for z in zones} 2167 linkTargets = set(links.values()) 2168 if not linkTargets.issubset(zoneNames): 2169 raise RuntimeError( 2170 "Link targets not found: %s" % linkTargets.difference(zoneNames) 2171 ) 2172 2173 2174def partition(iterable, *predicates): 2175 def innerPartition(pred, it): 2176 it1, it2 = tee(it) 2177 return (filter(pred, it1), 
filterfalse(pred, it2)) 2178 2179 if len(predicates) == 0: 2180 return iterable 2181 (left, right) = innerPartition(predicates[0], iterable) 2182 if len(predicates) == 1: 2183 return (left, right) 2184 return tuple([left] + list(partition(right, *predicates[1:]))) 2185 2186 2187def listIANAFiles(tzdataDir): 2188 def isTzFile(d, m, f): 2189 return m(f) and d.isfile(d.resolve(f)) 2190 2191 return filter( 2192 partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match), 2193 tzdataDir.listdir(), 2194 ) 2195 2196 2197def readIANAFiles(tzdataDir, files): 2198 """Read all IANA time zone files from the given iterable.""" 2199 nameSyntax = "[\w/+\-]+" 2200 pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax) 2201 pLink = re.compile( 2202 r"Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?" % (nameSyntax, nameSyntax) 2203 ) 2204 2205 def createZone(line, fname): 2206 match = pZone.match(line) 2207 name = match.group("name") 2208 return Zone(name, fname) 2209 2210 def createLink(line, fname): 2211 match = pLink.match(line) 2212 (name, target) = match.group("name", "target") 2213 return (Zone(name, fname), target) 2214 2215 zones = set() 2216 links = dict() 2217 for filename in files: 2218 filepath = tzdataDir.resolve(filename) 2219 for line in tzdataDir.readlines(filepath): 2220 if line.startswith("Zone"): 2221 zones.add(createZone(line, filename)) 2222 if line.startswith("Link"): 2223 (link, target) = createLink(line, filename) 2224 links[link] = target 2225 2226 return (zones, links) 2227 2228 2229def readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory): 2230 """Read the IANA time zone information from `tzdataDir`.""" 2231 2232 backzoneFiles = {"backzone"} 2233 (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__) 2234 2235 # Read zone and link infos. 2236 (zones, links) = readIANAFiles(tzdataDir, tzfiles) 2237 (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles) 2238 2239 # Remove the placeholder time zone "Factory". 
    if ignoreFactory:
        zones.remove(Zone("Factory"))

    # Merge with backzone data.
    if not ignoreBackzone:
        zones |= backzones
        # Drop links whose name is now a proper backzone zone.
        links = {
            name: target for name, target in links.items() if name not in backzones
        }
        links.update(backlinks)

    validateTimeZones(zones, links)

    return (zones, links)


def readICUResourceFile(filename):
    """Read an ICU resource file.

    Yields (<table-name>, <value>) for each table, where <table-name> is the
    "|"-joined path of nested table names and <value> is None, a single
    int/str, or a list of them.
    """

    numberValue = r"-?\d+"
    stringValue = r'".+?"'

    def asVector(val):
        return r"%s(?:\s*,\s*%s)*" % (val, val)

    numberVector = asVector(numberValue)
    stringVector = asVector(stringValue)

    reNumberVector = re.compile(numberVector)
    reStringVector = re.compile(stringVector)
    reNumberValue = re.compile(numberValue)
    reStringValue = re.compile(stringValue)

    # Parse a comma-separated vector of numbers or quoted strings.
    def parseValue(value):
        m = reNumberVector.match(value)
        if m:
            return [int(v) for v in reNumberValue.findall(value)]
        m = reStringVector.match(value)
        if m:
            return [v[1:-1] for v in reStringValue.findall(value)]
        raise RuntimeError("unknown value type: %s" % value)

    # Collapse a value list: [] -> None, [x] -> x, otherwise the list itself.
    def extractValue(values):
        if len(values) == 0:
            return None
        if len(values) == 1:
            return values[0]
        return values

    # Build a line-matching regex from the given parts, allowing /*...*/ and
    # //-style comments around them.
    def line(*args):
        maybeMultiComments = r"(?:/\*[^*]*\*/)*"
        maybeSingleComment = r"(?://.*)?"
2295 lineStart = "^%s" % maybeMultiComments 2296 lineEnd = "%s\s*%s$" % (maybeMultiComments, maybeSingleComment) 2297 return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd]))) 2298 2299 tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)' 2300 tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector) 2301 2302 reStartTable = line(tableName, r"\{") 2303 reEndTable = line(r"\}") 2304 reSingleValue = line(r",?", tableValue, r",?") 2305 reCompactTable = line(tableName, r"\{", tableValue, r"\}") 2306 reEmptyLine = line() 2307 2308 tables = [] 2309 2310 def currentTable(): 2311 return "|".join(tables) 2312 2313 values = [] 2314 for line in flines(filename, "utf-8-sig"): 2315 line = line.strip() 2316 if line == "": 2317 continue 2318 2319 m = reEmptyLine.match(line) 2320 if m: 2321 continue 2322 2323 m = reStartTable.match(line) 2324 if m: 2325 assert len(values) == 0 2326 tables.append(m.group("name")) 2327 continue 2328 2329 m = reEndTable.match(line) 2330 if m: 2331 yield (currentTable(), extractValue(values)) 2332 tables.pop() 2333 values = [] 2334 continue 2335 2336 m = reCompactTable.match(line) 2337 if m: 2338 assert len(values) == 0 2339 tables.append(m.group("name")) 2340 yield (currentTable(), extractValue(parseValue(m.group("value")))) 2341 tables.pop() 2342 continue 2343 2344 m = reSingleValue.match(line) 2345 if m and tables: 2346 values.extend(parseValue(m.group("value"))) 2347 continue 2348 2349 raise RuntimeError("unknown entry: %s" % line) 2350 2351 2352def readICUTimeZonesFromTimezoneTypes(icuTzDir): 2353 """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt 2354 and returns the tuple (zones, links). 
    """
    typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|"
    typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|"

    # Table keys encode "/" in zone names as ":"; undo that here.
    def toTimeZone(name):
        return Zone(name.replace(":", "/"))

    zones = set()
    links = dict()

    for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")):
        if name.startswith(typeMapTimeZoneKey):
            zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :]))
        if name.startswith(typeAliasTimeZoneKey):
            links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value

    validateTimeZones(zones, links)

    return (zones, links)


def readICUTimeZonesFromZoneInfo(icuTzDir):
    """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt
    and returns the tuple (zones, links).
    """
    zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
    linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
    namesKey = "zoneinfo64:table(nofallback)|Names"

    # Zones and links are positional entries in the Zones array; a link's int
    # value is the index of its target. Names lists all ids in array order.
    tzId = 0
    tzLinks = dict()
    tzNames = []

    for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
        if name == zoneKey:
            tzId += 1
        elif name == linkKey:
            tzLinks[tzId] = int(value)
            tzId += 1
        elif name == namesKey:
            tzNames.extend(value)

    links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
    zones = {Zone(v) for v in tzNames if Zone(v) not in links}

    validateTimeZones(zones, links)

    return (zones, links)


def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
    # zoneinfo64.txt contains the supported time zones by ICU. This data is
    # generated from tzdata files, it doesn't include "backzone" in stock ICU.
    (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir)

    # timezoneTypes.txt contains the canonicalization information for ICU. This
    # data is generated from CLDR files. It includes data about time zones from
    # tzdata's "backzone" file.
    (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)

    # Remove the placeholder time zone "Factory".
    # See also <https://github.com/eggert/tz/blob/master/factory>.
    if ignoreFactory:
        zoneinfoZones.remove(Zone("Factory"))

    # Remove the ICU placeholder time zone "Etc/Unknown".
    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
    for zones in (zoneinfoZones, typesZones):
        zones.remove(Zone("Etc/Unknown"))

    # Remove any outdated ICU links.
    for links in (zoneinfoLinks, typesLinks):
        for zone in otherICULegacyLinks().keys():
            if zone not in links:
                raise KeyError(f"Can't remove non-existent link from '{zone}'")
            del links[zone]

    # Information in zoneinfo64 should be a superset of timezoneTypes.
    def inZoneInfo64(zone):
        return zone in zoneinfoZones or zone in zoneinfoLinks

    notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
    if notFoundInZoneInfo64:
        raise RuntimeError(
            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
        )

    notFoundInZoneInfo64 = [
        zone for zone in typesLinks.keys() if not inZoneInfo64(zone)
    ]
    if notFoundInZoneInfo64:
        raise RuntimeError(
            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
        )

    # zoneinfo64.txt only defines the supported time zones by ICU, the canonicalization
    # rules are defined through timezoneTypes.txt. Merge both to get the actual zones
    # and links used by ICU.
    # Zones win over links from the other file: a zoneinfo64 zone that
    # timezoneTypes treats as a link is excluded, and vice versa.
    icuZones = set(
        chain(
            (zone for zone in zoneinfoZones if zone not in typesLinks),
            (zone for zone in typesZones),
        )
    )
    icuLinks = dict(
        chain(
            (
                (zone, target)
                for (zone, target) in zoneinfoLinks.items()
                if zone not in typesZones
            ),
            ((zone, target) for (zone, target) in typesLinks.items()),
        )
    )

    return (icuZones, icuLinks)


def readICULegacyZones(icuDir):
    """Read the ICU legacy time zones from `icuTzDir`/tools/tzcode/icuzones
    and returns the tuple (zones, links).
    """
    tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))

    # Per spec we must recognize only IANA time zones and links, but ICU
    # recognizes various legacy, non-IANA time zones and links. Compute these
    # non-IANA time zones and links.

    # Most legacy, non-IANA time zones and links are in the icuzones file.
    (zones, links) = readIANAFiles(tzdir, ["icuzones"])

    # Remove the ICU placeholder time zone "Etc/Unknown".
    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
    zones.remove(Zone("Etc/Unknown"))

    # A handful of non-IANA zones/links are not in icuzones and must be added
    # manually so that we won't invoke ICU with them.
    for (zone, target) in otherICULegacyLinks().items():
        if zone in links:
            if links[zone] != target:
                raise KeyError(
                    f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'"
                )
            else:
                # Same mapping already in icuzones; the manual entry is stale.
                print(
                    f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()"
                )
        links[zone] = target

    return (zones, links)


def otherICULegacyLinks():
    """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time
    zones with the exception of time zones which are removed by IANA after an
    ICU release.
2511 2512 For example ICU 67 uses tzdata2018i, but tzdata2020b removed the link from 2513 "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates 2514 don't include modified icuzones files, so we must manually record any IANA 2515 modifications here. 2516 2517 After an ICU update, we can remove any no longer needed entries from this 2518 function by checking if the relevant entries are now included in icuzones. 2519 """ 2520 2521 return { 2522 # Current ICU is up-to-date with IANA, so this dict is empty. 2523 } 2524 2525 2526def icuTzDataVersion(icuTzDir): 2527 """Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt.""" 2528 2529 def searchInFile(pattern, f): 2530 p = re.compile(pattern) 2531 for line in flines(f, "utf-8-sig"): 2532 m = p.search(line) 2533 if m: 2534 return m.group(1) 2535 return None 2536 2537 zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt") 2538 if not os.path.isfile(zoneinfo): 2539 raise RuntimeError("file not found: %s" % zoneinfo) 2540 version = searchInFile("^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo) 2541 if version is None: 2542 raise RuntimeError( 2543 "%s does not contain a valid tzdata version string" % zoneinfo 2544 ) 2545 return version 2546 2547 2548def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone): 2549 """Find incorrect ICU zone entries.""" 2550 2551 def isIANATimeZone(zone): 2552 return zone in ianaZones or zone in ianaLinks 2553 2554 def isICUTimeZone(zone): 2555 return zone in icuZones or zone in icuLinks 2556 2557 def isICULink(zone): 2558 return zone in icuLinks 2559 2560 # All IANA zones should be present in ICU. 2561 missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)] 2562 # Normally zones in backzone are also present as links in one of the other 2563 # time zone files. The only exception to this rule is the Asia/Hanoi time 2564 # zone, this zone is only present in the backzone file. 
    expectedMissing = [] if ignoreBackzone else [Zone("Asia/Hanoi")]
    if missingTimeZones != expectedMissing:
        raise RuntimeError(
            "Not all zones are present in ICU, did you forget "
            "to run intl/update-tzdata.sh? %s" % missingTimeZones
        )

    # Zones which are only present in ICU?
    additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)]
    if additionalTimeZones:
        raise RuntimeError(
            "Additional zones present in ICU, did you forget "
            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
        )

    # Zones which are marked as links in ICU.
    result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))

    # Remove unnecessary UTC mappings.
    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
    result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)

    return sorted(result, key=itemgetter(0))


def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
    """Find incorrect ICU link entries."""

    def isIANATimeZone(zone):
        return zone in ianaZones or zone in ianaLinks

    def isICUTimeZone(zone):
        return zone in icuZones or zone in icuLinks

    def isICULink(zone):
        return zone in icuLinks

    def isICUZone(zone):
        return zone in icuZones

    # All links should be present in ICU.
    missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
    if missingTimeZones:
        raise RuntimeError(
            "Not all zones are present in ICU, did you forget "
            "to run intl/update-tzdata.sh? %s" % missingTimeZones
        )

    # Links which are only present in ICU?
    additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
    if additionalTimeZones:
        raise RuntimeError(
            "Additional links present in ICU, did you forget "
            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
        )

    result = chain(
        # IANA links which have a different target in ICU.
        (
            (zone, target, icuLinks[zone])
            for (zone, target) in ianaLinks.items()
            if isICULink(zone) and target != icuLinks[zone]
        ),
        # IANA links which are zones in ICU.
        (
            (zone, target, zone.name)
            for (zone, target) in ianaLinks.items()
            if isICUZone(zone)
        ),
    )

    # Remove unnecessary UTC mappings.
    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
    result = (
        (zone, target, icuTarget)
        for (zone, target, icuTarget) in result
        if target not in utcnames or icuTarget not in utcnames
    )

    return sorted(result, key=itemgetter(0))


# Header lines emitted into every generated file.
generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT."
tzdataVersionComment = "// tzdata version = {0}"


def processTimeZones(
    tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignoreFactory, out
):
    """Read the time zone info and create a new time zone cpp file."""
    print("Processing tzdata mapping...")
    (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)

    # Remove all legacy ICU time zones.
    icuZones = {zone for zone in icuZones if zone not in legacyZones}
    icuLinks = {
        zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
    }

    incorrectZones = findIncorrectICUZones(
        ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone
    )
    if not incorrectZones:
        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectLinks:
        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    print("Writing Intl tzdata file...")
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("")

        println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
        println("#define builtin_intl_TimeZoneDataGenerated_h")
        println("")

        println("namespace js {")
        println("namespace timezone {")
        println("")

        println("// Format:")
        println('// "ZoneName" // ICU-Name [time zone file]')
        println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
        for (zone, icuZone) in incorrectZones:
            println('    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
        println("};")
        println("")

        println("// Format:")
        println('// "LinkName", "Target" // ICU-Target [time zone file]')
        println("struct LinkAndTarget")
        println("{")
        println("    const char* const link;")
        println("    const char* const target;")
        println("};")
        println("")
        println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
        for (zone, target, icuTarget) in incorrectLinks:
            println(
                '    { "%s", "%s" }, // %s [%s]'
                % (zone, target, icuTarget, zone.filename)
            )
        println("};")
        println("")

        println(
            "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
        )
        println("// disallow the old and deprecated System V time zones.")
        println(
            "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
        )  # NOQA: E501
        println("const char* const legacyICUTimeZones[] = {")
        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
            println('    "%s",' % zone)
        println("};")
        println("")

        println("} // namespace timezone")
        println("} // namespace js")
        println("")
        println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")


def updateBackzoneLinks(tzdataDir, links):
    # Lift a predicate on a zone to a predicate on a (zone, target) pair.
    def withZone(fn):
        return lambda zone_target: fn(zone_target[0])

    (backzoneZones, backzoneLinks) = readIANAFiles(tzdataDir, ["backzone"])
    (stableZones, updatedLinks, updatedZones) = partition(
        links.items(),
        # Link not changed in backzone.
        withZone(lambda zone: zone not in backzoneLinks and zone not in backzoneZones),
        # Link has a new target.
        withZone(lambda zone: zone in backzoneLinks),
    )
    # Keep stable zones and links with updated target.
    return dict(
        chain(
            stableZones,
            map(withZone(lambda zone: (zone, backzoneLinks[zone])), updatedLinks),
        )
    )


def generateTzDataLinkTestContent(testDir, version, fileName, description, links):
    """Write a jstest to |testDir|/|fileName| asserting that every link in
    |links| (an iterable of (zone, target) pairs) resolves to its target."""
    with io.open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(
            """
const tzMapper = [
    x => x,
    x => x.toUpperCase(),
    x => x.toLowerCase(),
];
"""
        )

        println(description)
        println("const links = {")
        for (zone, target) in sorted(links, key=itemgetter(0)):
            println('    "%s": "%s",' % (zone, target))
        println("};")

        println(
            """
for (let [linkName, target] of Object.entries(links)) {
    if (target === "Etc/UTC" || target === "Etc/GMT")
        target = "UTC";

    for (let map of tzMapper) {
        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
    }
}
"""
        )
        println(
            """
if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
    (zones, links) = readIANAFiles(tzdataDir, ["backward"])
    # The backward file contains only links, no zones.
    assert len(zones) == 0

    if not ignoreBackzone:
        links = updateBackzoneLinks(tzdataDir, links)

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_backward_links.js",
        "// Link names derived from IANA Time Zone Database, backward file.",
        links.items(),
    )


def generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
    tzfiles = filterfalse(
        {"backward", "backzone"}.__contains__, listIANAFiles(tzdataDir)
    )
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)

    if not ignoreBackzone:
        links = updateBackzoneLinks(tzdataDir, links)

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_notbackward_links.js",
        "// Link names derived from IANA Time Zone Database, excluding backward file.",
        links.items(),
    )


def generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir):
    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    if not ignoreBackzone:
        comment = """\
// This file was generated with historical, pre-1970 backzone information
// respected. Therefore, every zone key listed below is its own Zone, not
// a Link to a modern-day target as IANA ignoring backzones would say.

"""
    else:
        comment = """\
// This file was generated while ignoring historical, pre-1970 backzone
// information. Therefore, every zone key listed below is part of a Link
// whose target is the corresponding value.

"""

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_backzone.js",
        comment + "// Backzone zones derived from IANA Time Zone Database.",
        (
            (zone, zone if not ignoreBackzone else links[zone])
            for zone in backzones
            if zone in links
        ),
    )


def generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir):
    backzoneFiles = {"backzone"}
    (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)

    # Read zone and link infos.
    (zones, links) = readIANAFiles(tzdataDir, tzfiles)
    (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)

    if not ignoreBackzone:
        comment = """\
// This file was generated with historical, pre-1970 backzone information
// respected. Therefore, every zone key listed below points to a target
// in the backzone file and not to its modern-day target as IANA ignoring
// backzones would say.

"""
    else:
        comment = """\
// This file was generated while ignoring historical, pre-1970 backzone
// information. Therefore, every zone key listed below is part of a Link
// whose target is the corresponding value ignoring any backzone entries.

"""

    generateTzDataLinkTestContent(
        testDir,
        version,
        "timeZone_backzone_links.js",
        comment + "// Backzone links derived from IANA Time Zone Database.",
        (
            (zone, target if not ignoreBackzone else links[zone])
            for (zone, target) in backlinks.items()
        ),
    )


def generateTzDataTestVersion(tzdataDir, version, testDir):
    """Write a jstest asserting the engine's tzdata version matches |version|."""
    fileName = "timeZone_version.js"

    with io.open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("""const tzdata = "{0}";""".format(version))

        println(
            """
if (typeof getICUOptions === "undefined") {
    var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
}

var options = getICUOptions();

assertEq(options.tzdata, tzdata);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTestCanonicalZones(
    tzdataDir, version, ignoreBackzone, ignoreFactory, testDir
):
    fileName = "supportedValuesOf-timeZones-canonical.js"

    # Read zone and link infos.
    (ianaZones, _) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)

    # Replace Etc/GMT and Etc/UTC with UTC.
    ianaZones.remove(Zone("Etc/GMT"))
    ianaZones.remove(Zone("Etc/UTC"))
    ianaZones.add(Zone("UTC"))

    # See findIncorrectICUZones() for why Asia/Hanoi has to be special-cased.
    ianaZones.remove(Zone("Asia/Hanoi"))

    if not ignoreBackzone:
        comment = """\
// This file was generated with historical, pre-1970 backzone information
// respected.
"""
    else:
        comment = """\
// This file was generated while ignoring historical, pre-1970 backzone
// information.
"""

    with io.open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("")
        println(comment)

        println("const zones = [")
        for zone in sorted(ianaZones):
            println(f'    "{zone}",')
        println("];")

        println(
            """
let supported = Intl.supportedValuesOf("timeZone");

assertEqArray(supported, zones);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTests(tzdataDir, version, ignoreBackzone, ignoreFactory, testDir):
    """Generate all tzdata-derived jstests under |testDir|."""
    dtfTestDir = os.path.join(testDir, "DateTimeFormat")
    if not os.path.isdir(dtfTestDir):
        raise RuntimeError("not a directory: %s" % dtfTestDir)

    generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, dtfTestDir)
    generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, dtfTestDir)
    generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, dtfTestDir)
    generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, dtfTestDir)
    generateTzDataTestVersion(tzdataDir, version, dtfTestDir)
    generateTzDataTestCanonicalZones(
        tzdataDir, version, ignoreBackzone, ignoreFactory, testDir
    )


def updateTzdata(topsrcdir, args):
    """Update the time zone cpp file."""

    icuDir = os.path.join(topsrcdir, "intl/icu/source")
    if not os.path.isdir(icuDir):
        raise RuntimeError("not a directory: %s" % icuDir)

    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
    if not os.path.isdir(icuTzDir):
        raise RuntimeError("not a directory: %s" % icuTzDir)

    intlTestDir = os.path.join(topsrcdir, "js/src/tests/non262/Intl")
    if not os.path.isdir(intlTestDir):
        raise RuntimeError("not a directory: %s" % intlTestDir)

    # |tzDir| may be a tzdata directory or a (possibly gzipped) tar archive.
    tzDir = args.tz
    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
        raise RuntimeError("not a directory or file: %s" % tzDir)
    ignoreBackzone = args.ignore_backzone
    # TODO: Accept or ignore the placeholder time zone "Factory"?
    ignoreFactory = False
    out = args.out

    version = icuTzDataVersion(icuTzDir)
    url = (
        "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
    )

    print("Arguments:")
    print("\ttzdata version: %s" % version)
    print("\ttzdata URL: %s" % url)
    print("\ttzdata directory|file: %s" % tzDir)
    print("\tICU directory: %s" % icuDir)
    print("\tICU timezone directory: %s" % icuTzDir)
    print("\tIgnore backzone file: %s" % ignoreBackzone)
    print("\tOutput file: %s" % out)
    print("")

    # Process |f|, which is either a tar archive or an unpacked directory.
    def updateFrom(f):
        if os.path.isfile(f) and tarfile.is_tarfile(f):
            with tarfile.open(f, "r:*") as tar:
                processTimeZones(
                    TzDataFile(tar),
                    icuDir,
                    icuTzDir,
                    version,
                    ignoreBackzone,
                    ignoreFactory,
                    out,
                )
                generateTzDataTests(
                    TzDataFile(tar), version, ignoreBackzone, ignoreFactory, intlTestDir
                )
        elif os.path.isdir(f):
            processTimeZones(
                TzDataDir(f),
                icuDir,
                icuTzDir,
                version,
                ignoreBackzone,
                ignoreFactory,
                out,
            )
            generateTzDataTests(
                TzDataDir(f), version, ignoreBackzone, ignoreFactory, intlTestDir
            )
        else:
            raise RuntimeError("unknown format")

    if tzDir is None:
        print("Downloading tzdata file...")
        with closing(urlopen(url)) as tzfile:
            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
                print("File stored in %s" % tztmpfile.name)
                tztmpfile.write(tzfile.read())
                tztmpfile.flush()
                updateFrom(tztmpfile.name)
    else:
        updateFrom(tzDir)


def readCurrencyFile(tree):
    """Yield (currency, minorUnits, currencyName, countryName) for every
    ISO 4217 entry in |tree| that uses a non-default number of digits."""
    reCurrency = re.compile(r"^[A-Z]{3}$")
    reIntMinorUnits = re.compile(r"^\d+$")

    for country in tree.iterfind(".//CcyNtry"):
        # Skip entry if no currency information is available.
        currency = country.findtext("Ccy")
        if currency is None:
            continue
        assert reCurrency.match(currency)

        minorUnits = country.findtext("CcyMnrUnts")
        assert minorUnits is not None

        # Skip all entries without minorUnits or which use the default minorUnits.
        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
            currencyName = country.findtext("CcyNm")
            countryName = country.findtext("CtryNm")
            yield (currency, int(minorUnits), currencyName, countryName)


def writeCurrencyFile(published, currencies, out):
    """Write the generated JS file mapping currency codes to decimal digits.

    |published| is the publication date string recorded in the file header,
    |currencies| an iterable of (code, minorUnits, currencyName, countryName)
    tuples as produced by readCurrencyFile, |out| the output path.
    """
    with io.open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println("// Version: {}".format(published))

        println(
            """
/**
 * Mapping from currency codes to the number of decimal digits used for them.
 * Default is 2 digits.
 *
 * Spec: ISO 4217 Currency and Funds Code List.
 * http://www.currency-iso.org/en/home/tables/table-a1.html
 */"""
        )
        println("var currencyDigits = {")
        # Group by currency code: several countries can share one currency, so
        # emit one comment line per country, followed by a single map entry.
        for (currency, entries) in groupby(
            sorted(currencies, key=itemgetter(0)), itemgetter(0)
        ):
            for (_, minorUnits, currencyName, countryName) in entries:
                println("    // {} ({})".format(currencyName, countryName))
            println("    {}: {},".format(currency, minorUnits))
        println("};")


def updateCurrency(topsrcdir, args):
    """Update the CurrencyDataGenerated.js file."""
    import xml.etree.ElementTree as ET
    from random import randint

    url = args.url
    out = args.out
    filename = args.file

    print("Arguments:")
    print("\tDownload url: %s" % url)
    print("\tLocal currency file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(currencyFile):
        # Parse the ISO 4217 XML and regenerate the output file from it.
        print("Processing currency code list file...")
        tree = ET.parse(currencyFile)
        published = tree.getroot().attrib["Pblshd"]
        currencies = readCurrencyFile(tree)

        print("Writing CurrencyData file...")
        writeCurrencyFile(published, currencies, out)

    if filename is not None:
        print("Always make sure you have the newest currency code list file!")
        updateFrom(filename)
    else:
        print("Downloading currency & funds code list...")
        request = UrlRequest(url)
        # NOTE(review): a randomized Firefox User-Agent is sent, presumably to
        # avoid the download being blocked — confirm before changing.
        request.add_header(
            "User-agent",
            "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
                randint(1, 999)
            ),
        )
        with closing(urlopen(request)) as currencyFile:
            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
                print("File stored in %s" % currencyTmpFile.name)
                currencyTmpFile.write(currencyFile.read())
                currencyTmpFile.flush()
                updateFrom(currencyTmpFile.name)


def writeUnicodeExtensionsMappings(println, mapping, extension):
    """Emit the C++ helpers and the Replace{extension}ExtensionType() function
    mapping deprecated BCP 47 extension types to their preferred values.

    |mapping| maps extension keys to {deprecated-type: preferred-type} dicts;
    keys with more than |linear_search_max_length| replacements get sorted
    arrays plus a binary search, smaller ones a chain of if-statements.
    """
    println(
        """
template <size_t Length>
static inline bool Is{0}Key(mozilla::Span<const char> key, const char (&str)[Length]) {{
  static_assert(Length == {0}KeyLength + 1,
                "{0} extension key is two characters long");
  return memcmp(key.data(), str, Length - 1) == 0;
}}

template <size_t Length>
static inline bool Is{0}Type(mozilla::Span<const char> type, const char (&str)[Length]) {{
  static_assert(Length > {0}KeyLength + 1,
                "{0} extension type contains more than two characters");
  return type.size() == (Length - 1) &&
         memcmp(type.data(), str, Length - 1) == 0;
}}
""".format(
            extension
        ).rstrip(
            "\n"
        )
    )

    # Keys with at most this many replacements use a linear if-chain instead
    # of binary search over sorted arrays.
    linear_search_max_length = 4

    needs_binary_search = any(
        len(replacements.items()) > linear_search_max_length
        for replacements in mapping.values()
    )

    # The comparison/search helpers are only emitted when at least one key
    # actually needs binary search.
    if needs_binary_search:
        println(
            """
static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
  MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.size(); i++) {{
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
      return r;
    }}
  }}

  // Return zero if both strings are equal or a positive number if |b| is a
  // prefix of |a|.
  return int32_t(UnsignedChar(a[b.size()]));
}}

template <size_t Length>
static inline const char* Search{0}Replacement(
    const char* (&types)[Length], const char* (&aliases)[Length],
    mozilla::Span<const char> type) {{

  auto p = std::lower_bound(std::begin(types), std::end(types), type,
                            [](const auto& a, const auto& b) {{
                              return Compare{0}Type(a, b) < 0;
                            }});
  if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
    return aliases[std::distance(std::begin(types), p)];
  }}
  return nullptr;
}}
""".format(
            extension
        ).rstrip(
            "\n"
        )
    )

    println(
        """
/**
 * Mapping from deprecated BCP 47 {0} extension types to their preferred
 * values.
 *
 * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
 * Spec: https://www.unicode.org/reports/tr35/#t_Extension
 */
const char* mozilla::intl::Locale::Replace{0}ExtensionType(
    mozilla::Span<const char> key, mozilla::Span<const char> type) {{
  MOZ_ASSERT(key.size() == {0}KeyLength);
  MOZ_ASSERT(IsCanonicallyCased{0}Key(key));

  MOZ_ASSERT(type.size() > {0}KeyLength);
  MOZ_ASSERT(IsCanonicallyCased{0}Type(type));
""".format(
            extension
        )
    )

    def to_hash_key(replacements):
        # Canonical, hashable representation of a replacement dict, used to
        # detect keys whose replacement tables are identical.
        return str(sorted(replacements.items()))

    def write_array(subtags, name, length):
        # Emit |subtags| as a C++ string array, packed so generated lines stay
        # within 80 columns; entries are centered for column alignment.
        max_entries = (80 - len("    ")) // (length + len('"", '))

        println("    static const char* {}[{}] = {{".format(name, len(subtags)))

        for entries in grouper(subtags, max_entries):
            entries = (
                '"{}"'.format(tag).center(length + 2)
                for tag in entries
                if tag is not None
            )
            println("        {},".format(", ".join(entries)))

        println("    };")

    # Merge duplicate keys. The first key (in sorted order) with a given
    # replacement table becomes canonical; later keys with the same table are
    # recorded as its aliases and skipped in the main loop below.
    key_aliases = {}
    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
        hash_key = to_hash_key(replacements)
        if hash_key not in key_aliases:
            key_aliases[hash_key] = []
        else:
            key_aliases[hash_key].append(key)

    first_key = True
    for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
        hash_key = to_hash_key(replacements)
        if key in key_aliases[hash_key]:
            continue

        # One condition per key sharing this replacement table.
        cond = (
            'Is{}Key(key, "{}")'.format(extension, k)
            for k in [key] + key_aliases[hash_key]
        )

        if_kind = "if" if first_key else "else if"
        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
        println(
            """
  {} ({}) {{""".format(
                if_kind, cond
            ).strip(
                "\n"
            )
        )
        first_key = False

        replacements = sorted(replacements.items(), key=itemgetter(0))

        if len(replacements) > linear_search_max_length:
            # Large table: sorted parallel arrays + binary search helper.
            types = [t for (t, _) in replacements]
            preferred = [r for (_, r) in replacements]
            max_len = max(len(k) for k in types + preferred)

            write_array(types, "types", max_len)
            write_array(preferred, "aliases", max_len)
            println(
                """
    return Search{}Replacement(types, aliases, type);
""".format(
                    extension
                ).strip(
                    "\n"
                )
            )
        else:
            # Small table: straight-line if-chain.
            for (type, replacement) in replacements:
                println(
                    """
    if (Is{}Type(type, "{}")) {{
      return "{}";
    }}""".format(
                        extension, type, replacement
                    ).strip(
                        "\n"
                    )
                )

        println(
            """
  }""".lstrip(
                "\n"
            )
        )

    println(
        """
  return nullptr;
}
""".strip(
            "\n"
        )
    )


def readICUUnitResourceFile(filepath):
    """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
    second entry the unit name.

    Example:

    root{
        units{
            compound{
            }
            coordinate{
            }
            length{
                meter{
                }
            }
        }
        unitsNarrow:alias{"/LOCALE/unitsShort"}
        unitsShort{
            duration{
                day{
                }
                day-person:alias{"/LOCALE/unitsShort/duration/day"}
            }
            length{
                meter{
                }
            }
        }
    }

    Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}
    """

    # Table names may contain word chars, "-", "%", ":" and quotes (aliases).
    start_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
    end_table_re = re.compile(r"^\}$")
    table_entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")

    # The current resource table.
    table = {}

    # List of parent tables when parsing.
    parents = []

    # Track multi-line comments state.
    in_multiline_comment = False

    for line in flines(filepath, "utf-8-sig"):
        # Remove leading and trailing whitespace.
        line = line.strip()

        # Skip over comments.
        if in_multiline_comment:
            if line.endswith("*/"):
                in_multiline_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_multiline_comment = True
            continue

        # Try to match the start of a table, e.g. `length{` or `meter{`.
        match = start_table_re.match(line)
        if match:
            parents.append(table)
            table_name = match.group(1)
            new_table = {}
            table[table_name] = new_table
            table = new_table
            continue

        # Try to match the end of a table.
        match = end_table_re.match(line)
        if match:
            table = parents.pop()
            continue

        # Try to match a table entry, e.g. `dnam{"meter"}`.
        match = table_entry_re.match(line)
        if match:
            entry_key = match.group(1)
            entry_value = match.group(2)
            table[entry_key] = entry_value
            continue

        raise Exception("unexpected line: '{}' in {}".format(line, filepath))

    assert len(parents) == 0, "Not all tables closed"
    assert len(table) == 1, "More than one root table"

    # Remove the top-level language identifier table.
    (_, unit_table) = table.popitem()

    # Add all units for the three display formats "units", "unitsNarrow", and "unitsShort".
    # But exclude the pseudo-units "compound" and "coordinate".
    # Unit names ending in ":alias" have that suffix (6 chars) stripped.
    return {
        (unit_type, unit_name if not unit_name.endswith(":alias") else unit_name[:-6])
        for unit_display in ("units", "unitsNarrow", "unitsShort")
        if unit_display in unit_table
        for (unit_type, unit_names) in unit_table[unit_display].items()
        if unit_type != "compound" and unit_type != "coordinate"
        for unit_name in unit_names.keys()
    }


def computeSupportedUnits(all_units, sanctioned_units):
    """Given the set of all possible ICU unit identifiers and the set of sanctioned unit
    identifiers, compute the set of effectively supported ICU unit identifiers.
    """

    def find_match(unit):
        # Map a bare unit name to its unique (type, name) pair, or None if
        # ICU doesn't know the unit at all.
        unit_match = [
            (unit_type, unit_name)
            for (unit_type, unit_name) in all_units
            if unit_name == unit
        ]
        if unit_match:
            assert len(unit_match) == 1
            return unit_match[0]
        return None

    def compound_unit_identifiers():
        # Every "numerator-per-denominator" combination of sanctioned units.
        for numerator in sanctioned_units:
            for denominator in sanctioned_units:
                yield "{}-per-{}".format(numerator, denominator)

    # Every sanctioned unit must exist in ICU.
    supported_simple_units = {find_match(unit) for unit in sanctioned_units}
    assert None not in supported_simple_units

    # Compound units are supported only when ICU has a dedicated entry.
    supported_compound_units = {
        unit_match
        for unit_match in (find_match(unit) for unit in compound_unit_identifiers())
        if unit_match
    }

    return supported_simple_units | supported_compound_units


def readICUDataFilterForUnits(data_filter_file):
    """Return the set of (unit type, unit name) pairs listed under the
    "unit_tree" category in the ICU data filter JSON file."""
    with io.open(data_filter_file, mode="r", encoding="utf-8") as f:
        data_filter = json.load(f)

    # Find the rule set for the "unit_tree".
    unit_tree_rules = [
        entry["rules"]
        for entry in data_filter["resourceFilters"]
        if entry["categories"] == ["unit_tree"]
    ]
    assert len(unit_tree_rules) == 1

    # Compute the list of included units from that rule set. The regular expression must match
    # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound".
    included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$")
    filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0])

    return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit}


def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units):
    """Regenerate the JS and C++ files derived from the sanctioned unit list:
    SanctionedSimpleUnitIdentifiersGenerated.js, MeasureUnitGenerated.h, and
    (via writeUnitTestFiles) the unit test files."""
    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    intl_components_src_dir = os.path.join(
        js_src_builtin_intl_dir, "../../../../intl/components/src"
    )

    def find_unit_type(unit):
        # Look up the unique ICU unit type for a sanctioned unit name.
        result = [
            unit_type for (unit_type, unit_name) in all_units if unit_name == unit
        ]
        assert result and len(result) == 1
        return result[0]

    sanctioned_js_file = os.path.join(
        js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js"
    )
    with io.open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        # JS object literal mapping each sanctioned unit to |true|.
        sanctioned_units_object = json.dumps(
            {unit: True for unit in sorted(sanctioned_units)},
            sort_keys=True,
            indent=4,
            separators=(",", ": "),
        )

        println(generatedFileWarning)

        println(
            """
/**
 * The list of currently supported simple unit identifiers.
 *
 * Intl.NumberFormat Unified API Proposal
 */"""
        )

        println(
            "var sanctionedSimpleUnitIdentifiers = {};".format(sanctioned_units_object)
        )

    sanctioned_h_file = os.path.join(intl_components_src_dir, "MeasureUnitGenerated.h")
    with io.open(sanctioned_h_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)

        println(
            """
#ifndef intl_components_MeasureUnitGenerated_h
#define intl_components_MeasureUnitGenerated_h

namespace mozilla::intl {

struct SimpleMeasureUnit {
  const char* const type;
  const char* const name;
};

/**
 * The list of currently supported simple unit identifiers.
 *
 * The list must be kept in alphabetical order of |name|.
 */
inline constexpr SimpleMeasureUnit simpleMeasureUnits[] = {
    // clang-format off"""
        )

        # sorted() keeps the required alphabetical order of |name|.
        for unit_name in sorted(sanctioned_units):
            println('    {{"{}", "{}"}},'.format(find_unit_type(unit_name), unit_name))

        println(
            """
    // clang-format on
};

} // namespace mozilla::intl

#endif
""".strip(
                "\n"
            )
        )

    writeUnitTestFiles(all_units, sanctioned_units)


def writeUnitTestFiles(all_units, sanctioned_units):
    """Generate test files for unit number formatters."""

    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    test_dir = os.path.join(
        js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat"
    )

    def write_test(file_name, test_content, indent=4):
        # Write one jstest file: reftest header, generated-file warning, the
        # sanctioned unit list, |test_content|, and the reportCompare footer.
        file_path = os.path.join(test_dir, file_name)
        with io.open(file_path, mode="w", encoding="utf-8", newline="") as f:
            println = partial(print, file=f)

            println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
            println("")
            println(generatedFileWarning)
            println("")
sanctioned_units_array = json.dumps( 3649 [unit for unit in sorted(sanctioned_units)], 3650 indent=indent, 3651 separators=(",", ": "), 3652 ) 3653 3654 println( 3655 "const sanctionedSimpleUnitIdentifiers = {};".format( 3656 sanctioned_units_array 3657 ) 3658 ) 3659 3660 println(test_content) 3661 3662 println( 3663 """ 3664if (typeof reportCompare === "function") 3665{}reportCompare(true, true);""".format( 3666 " " * indent 3667 ) 3668 ) 3669 3670 write_test( 3671 "unit-compound-combinations.js", 3672 """ 3673// Test all simple unit identifier combinations are allowed. 3674 3675for (const numerator of sanctionedSimpleUnitIdentifiers) { 3676 for (const denominator of sanctionedSimpleUnitIdentifiers) { 3677 const unit = `${numerator}-per-${denominator}`; 3678 const nf = new Intl.NumberFormat("en", {style: "unit", unit}); 3679 3680 assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join("")); 3681 } 3682}""", 3683 ) 3684 3685 all_units_array = json.dumps( 3686 ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ") 3687 ) 3688 3689 write_test( 3690 "unit-well-formed.js", 3691 """ 3692const allUnits = {}; 3693""".format( 3694 all_units_array 3695 ) 3696 + """ 3697// Test only sanctioned unit identifiers are allowed. 
3698 3699for (const typeAndUnit of allUnits) { 3700 const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/); 3701 3702 let allowed; 3703 if (unit.includes("-per-")) { 3704 const [numerator, denominator] = unit.split("-per-"); 3705 allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) && 3706 sanctionedSimpleUnitIdentifiers.includes(denominator); 3707 } else { 3708 allowed = sanctionedSimpleUnitIdentifiers.includes(unit); 3709 } 3710 3711 if (allowed) { 3712 const nf = new Intl.NumberFormat("en", {style: "unit", unit}); 3713 assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join("")); 3714 } else { 3715 assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}), 3716 RangeError, `Missing error for "${typeAndUnit}"`); 3717 } 3718}""", 3719 ) 3720 3721 write_test( 3722 "unit-formatToParts-has-unit-field.js", 3723 """ 3724// Test only English and Chinese to keep the overall runtime reasonable. 3725// 3726// Chinese is included because it contains more than one "unit" element for 3727// certain unit combinations. 3728const locales = ["en", "zh"]; 3729 3730// Plural rules for English only differentiate between "one" and "other". Plural 3731// rules for Chinese only use "other". That means we only need to test two values 3732// per unit. 3733const values = [0, 1]; 3734 3735// Ensure unit formatters contain at least one "unit" element. 
3736 3737for (const locale of locales) { 3738 for (const unit of sanctionedSimpleUnitIdentifiers) { 3739 const nf = new Intl.NumberFormat(locale, {style: "unit", unit}); 3740 3741 for (const value of values) { 3742 assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true, 3743 `locale=${locale}, unit=${unit}`); 3744 } 3745 } 3746 3747 for (const numerator of sanctionedSimpleUnitIdentifiers) { 3748 for (const denominator of sanctionedSimpleUnitIdentifiers) { 3749 const unit = `${numerator}-per-${denominator}`; 3750 const nf = new Intl.NumberFormat(locale, {style: "unit", unit}); 3751 3752 for (const value of values) { 3753 assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true, 3754 `locale=${locale}, unit=${unit}`); 3755 } 3756 } 3757 } 3758}""", 3759 indent=2, 3760 ) 3761 3762 3763def updateUnits(topsrcdir, args): 3764 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3765 icu_path = os.path.join(topsrcdir, "intl", "icu") 3766 icu_unit_path = os.path.join(icu_path, "source", "data", "unit") 3767 3768 with io.open( 3769 os.path.join(js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiers.yaml"), 3770 mode="r", 3771 encoding="utf-8", 3772 ) as f: 3773 sanctioned_units = yaml.safe_load(f) 3774 3775 # Read all possible ICU unit identifiers from the "unit/root.txt" resource. 3776 unit_root_file = os.path.join(icu_unit_path, "root.txt") 3777 all_units = readICUUnitResourceFile(unit_root_file) 3778 3779 # Compute the set of effectively supported ICU unit identifiers. 3780 supported_units = computeSupportedUnits(all_units, sanctioned_units) 3781 3782 # Read the list of units we're including into the ICU data file. 3783 data_filter_file = os.path.join(icu_path, "data_filter.json") 3784 filtered_units = readICUDataFilterForUnits(data_filter_file) 3785 3786 # Both sets must match to avoid resource loading errors at runtime. 
    if supported_units != filtered_units:

        def units_to_string(units):
            # Render a set of (type, name) pairs as "type/name, type/name, ...".
            return ", ".join("/".join(u) for u in units)

        missing = supported_units - filtered_units
        if missing:
            raise RuntimeError("Missing units: {}".format(units_to_string(missing)))

        # Not exactly an error, but we currently don't have a use case where we need to support
        # more units than required by ECMA-402.
        extra = filtered_units - supported_units
        if extra:
            raise RuntimeError("Unnecessary units: {}".format(units_to_string(extra)))

    writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units)


def readICUNumberingSystemsResourceFile(filepath):
    """Returns a dictionary of numbering systems where the key denotes the numbering system name
    and the value a dictionary with additional numbering system data.

    Example:

    numberingSystems:table(nofallback){
        numberingSystems{
            latn{
                algorithmic:int{0}
                desc{"0123456789"}
                radix:int{10}
            }
            roman{
                algorithmic:int{1}
                desc{"%roman-upper"}
                radix:int{10}
            }
        }
    }

    Returns {"latn": {"digits": "0123456789", "algorithmic": False},
             "roman": {"algorithmic": True}}
    """

    # Table headers may carry a type annotation, e.g. `:table(nofallback)` or
    # `:int`, which is matched but not captured.
    start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
    end_table_re = re.compile(r"^\}$")
    table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")

    # The current resource table.
    table = {}

    # List of parent tables when parsing.
    parents = []

    # Track multi-line comments state.
    in_multiline_comment = False

    for line in flines(filepath, "utf-8-sig"):
        # Remove leading and trailing whitespace.
        line = line.strip()

        # Skip over comments.
        if in_multiline_comment:
            if line.endswith("*/"):
                in_multiline_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_multiline_comment = True
            continue

        # Try to match the start of a table, e.g. `latn{`.
        match = start_table_re.match(line)
        if match:
            parents.append(table)
            table_name = match.group(1)
            new_table = {}
            table[table_name] = new_table
            table = new_table
            continue

        # Try to match the end of a table.
        match = end_table_re.match(line)
        if match:
            table = parents.pop()
            continue

        # Try to match a table entry, e.g. `desc{"0123456789"}`.
        match = table_entry_re.match(line)
        if match:
            entry_key = match.group(1)
            # String entries stay strings; bare numbers become ints.
            entry_value = (
                match.group(2) if match.group(2) is not None else int(match.group(3))
            )
            table[entry_key] = entry_value
            continue

        raise Exception("unexpected line: '{}' in {}".format(line, filepath))

    assert len(parents) == 0, "Not all tables closed"
    assert len(table) == 1, "More than one root table"

    # Remove the two top-level "numberingSystems" tables.
    (_, numbering_systems) = table.popitem()
    (_, numbering_systems) = numbering_systems.popitem()

    # Assert all numbering systems use base 10.
    assert all(ns["radix"] == 10 for ns in numbering_systems.values())

    # Return the numbering systems.
    return {
        key: {"digits": value["desc"], "algorithmic": False}
        if not bool(value["algorithmic"])
        else {"algorithmic": True}
        for (key, value) in numbering_systems.items()
    }


def writeNumberingSystemFiles(numbering_systems):
    """Regenerate NumberingSystemsGenerated.h and the Intl test shell.js from
    the |numbering_systems| dictionary produced by
    readICUNumberingSystemsResourceFile."""
    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))

    numbering_systems_js_file = os.path.join(
        js_src_builtin_intl_dir, "NumberingSystemsGenerated.h"
    )
    with io.open(
        numbering_systems_js_file, mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println(generatedFileWarning)

        println(
            """
/**
 * The list of numbering systems with simple digit mappings.
 */

#ifndef builtin_intl_NumberingSystemsGenerated_h
#define builtin_intl_NumberingSystemsGenerated_h
"""
        )

        # Only non-algorithmic systems have simple digit mappings.
        simple_numbering_systems = sorted(
            name
            for (name, value) in numbering_systems.items()
            if not value["algorithmic"]
        )

        println("// clang-format off")
        println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\")
        println(
            "{}".format(
                ", \\\n".join(
                    '    "{}"'.format(name) for name in simple_numbering_systems
                )
            )
        )
        println("// clang-format on")
        println("")

        println("#endif // builtin_intl_NumberingSystemsGenerated_h")

    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl")

    intl_shell_js_file = os.path.join(test_dir, "shell.js")

    with io.open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)

        println(
            """
// source: CLDR file common/bcp47/number.xml; version CLDR {}.
3964// https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml 3965// https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml 3966""".format( 3967 readCLDRVersionFromICU() 3968 ).rstrip() 3969 ) 3970 3971 numbering_systems_object = json.dumps( 3972 numbering_systems, 3973 indent=2, 3974 separators=(",", ": "), 3975 sort_keys=True, 3976 ensure_ascii=False, 3977 ) 3978 println("const numberingSystems = {};".format(numbering_systems_object)) 3979 3980 3981def updateNumberingSystems(topsrcdir, args): 3982 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3983 icu_path = os.path.join(topsrcdir, "intl", "icu") 3984 icu_misc_path = os.path.join(icu_path, "source", "data", "misc") 3985 3986 with io.open( 3987 os.path.join(js_src_builtin_intl_dir, "NumberingSystems.yaml"), 3988 mode="r", 3989 encoding="utf-8", 3990 ) as f: 3991 numbering_systems = yaml.safe_load(f) 3992 3993 # Read all possible ICU unit identifiers from the "misc/numberingSystems.txt" resource. 3994 misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt") 3995 all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file) 3996 3997 all_numbering_systems_simple_digits = { 3998 name 3999 for (name, value) in all_numbering_systems.items() 4000 if not value["algorithmic"] 4001 } 4002 4003 # Assert ICU includes support for all required numbering systems. If this assertion fails, 4004 # something is broken in ICU. 4005 assert all_numbering_systems_simple_digits.issuperset( 4006 numbering_systems 4007 ), "{}".format(numbering_systems.difference(all_numbering_systems_simple_digits)) 4008 4009 # Assert the spec requires support for all numbering systems with simple digit mappings. If 4010 # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new 4011 # numbering systems. 
4012 assert all_numbering_systems_simple_digits.issubset(numbering_systems), "{}".format( 4013 all_numbering_systems_simple_digits.difference(numbering_systems) 4014 ) 4015 4016 writeNumberingSystemFiles(all_numbering_systems) 4017 4018 4019if __name__ == "__main__": 4020 import argparse 4021 4022 # This script must reside in js/src/builtin/intl to work correctly. 4023 (thisDir, thisFile) = os.path.split(os.path.abspath(__file__)) 4024 dirPaths = os.path.normpath(thisDir).split(os.sep) 4025 if "/".join(dirPaths[-4:]) != "js/src/builtin/intl": 4026 raise RuntimeError("%s must reside in js/src/builtin/intl" % __file__) 4027 topsrcdir = "/".join(dirPaths[:-4]) 4028 4029 def EnsureHttps(v): 4030 if not v.startswith("https:"): 4031 raise argparse.ArgumentTypeError("URL protocol must be https: " % v) 4032 return v 4033 4034 parser = argparse.ArgumentParser(description="Update intl data.") 4035 subparsers = parser.add_subparsers(help="Select update mode") 4036 4037 parser_cldr_tags = subparsers.add_parser( 4038 "langtags", help="Update CLDR language tags data" 4039 ) 4040 parser_cldr_tags.add_argument( 4041 "--version", metavar="VERSION", help="CLDR version number" 4042 ) 4043 parser_cldr_tags.add_argument( 4044 "--url", 4045 metavar="URL", 4046 default="https://unicode.org/Public/cldr/<VERSION>/core.zip", 4047 type=EnsureHttps, 4048 help="Download url CLDR data (default: %(default)s)", 4049 ) 4050 parser_cldr_tags.add_argument( 4051 "--out", 4052 default=os.path.join( 4053 topsrcdir, "intl", "components", "src", "LocaleGenerated.cpp" 4054 ), 4055 help="Output file (default: %(default)s)", 4056 ) 4057 parser_cldr_tags.add_argument( 4058 "file", nargs="?", help="Local cldr-core.zip file, if omitted uses <URL>" 4059 ) 4060 parser_cldr_tags.set_defaults(func=updateCLDRLangTags) 4061 4062 parser_tz = subparsers.add_parser("tzdata", help="Update tzdata") 4063 parser_tz.add_argument( 4064 "--tz", 4065 help="Local tzdata directory or file, if omitted downloads tzdata " 4066 
"distribution from https://www.iana.org/time-zones/", 4067 ) 4068 # ICU doesn't include the backzone file by default, but we still like to 4069 # use the backzone time zone names to avoid user confusion. This does lead 4070 # to formatting "historic" dates (pre-1970 era) with the wrong time zone, 4071 # but that's probably acceptable for now. 4072 parser_tz.add_argument( 4073 "--ignore-backzone", 4074 action="store_true", 4075 help="Ignore tzdata's 'backzone' file. Can be enabled to generate more " 4076 "accurate time zone canonicalization reflecting the actual time " 4077 "zones as used by ICU.", 4078 ) 4079 parser_tz.add_argument( 4080 "--out", 4081 default=os.path.join(thisDir, "TimeZoneDataGenerated.h"), 4082 help="Output file (default: %(default)s)", 4083 ) 4084 parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir)) 4085 4086 parser_currency = subparsers.add_parser( 4087 "currency", help="Update currency digits mapping" 4088 ) 4089 parser_currency.add_argument( 4090 "--url", 4091 metavar="URL", 4092 default="https://www.currency-iso.org/dam/downloads/lists/list_one.xml", # NOQA: E501 4093 type=EnsureHttps, 4094 help="Download url for the currency & funds code list (default: " 4095 "%(default)s)", 4096 ) 4097 parser_currency.add_argument( 4098 "--out", 4099 default=os.path.join(thisDir, "CurrencyDataGenerated.js"), 4100 help="Output file (default: %(default)s)", 4101 ) 4102 parser_currency.add_argument( 4103 "file", nargs="?", help="Local currency code list file, if omitted uses <URL>" 4104 ) 4105 parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir)) 4106 4107 parser_units = subparsers.add_parser( 4108 "units", help="Update sanctioned unit identifiers mapping" 4109 ) 4110 parser_units.set_defaults(func=partial(updateUnits, topsrcdir)) 4111 4112 parser_numbering_systems = subparsers.add_parser( 4113 "numbering", help="Update numbering systems with simple digit mappings" 4114 ) 4115 parser_numbering_systems.set_defaults( 4116 
func=partial(updateNumberingSystems, topsrcdir) 4117 ) 4118 4119 args = parser.parse_args() 4120 args.func(args) 4121