1#!/usr/local/bin/python3.8 2 3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice 4versa. 5 6It creates a ``const LangTag[]``, matching the tags from the OpenType 7languages system tag list to the language subtags of the BCP 47 language 8subtag registry, with some manual adjustments. The mappings are 9supplemented with macrolanguages' sublanguages and retired codes' 10replacements, according to BCP 47 and some manual additions where BCP 47 11omits a retired code entirely. 12 13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``, 14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags 15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to 16multiple BCP 47 tags) are listed here, except when the alphabetically 17first BCP 47 tag happens to be the chosen disambiguated tag. In that 18case, the fallback behavior will choose the right tag anyway. 19 20usage: ./gen-tag-table.py languagetags language-subtag-registry 21 22Input files: 23* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags 24* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry 25""" 26 27import collections 28import html 29from html.parser import HTMLParser 30import itertools 31import re 32import sys 33import unicodedata 34 35if len (sys.argv) != 3: 36 sys.exit (__doc__) 37 38def expect (condition, message=None): 39 if not condition: 40 if message is None: 41 raise AssertionError 42 raise AssertionError (message) 43 44def write (s): 45 sys.stdout.flush () 46 sys.stdout.buffer.write (s.encode ('utf-8')) 47 48DEFAULT_LANGUAGE_SYSTEM = '' 49 50# from https://www-01.sil.org/iso639-3/iso-639-3.tab 51ISO_639_3_TO_1 = { 52 'aar': 'aa', 53 'abk': 'ab', 54 'afr': 'af', 55 'aka': 'ak', 56 'amh': 'am', 57 'ara': 'ar', 58 'arg': 'an', 59 'asm': 'as', 60 'ava': 'av', 61 'ave': 'ae', 62 'aym': 'ay', 63 'aze': 'az', 64 'bak': 'ba', 65 'bam': 'bm', 66 'bel': 'be', 67 'ben': 'bn', 68 'bis': 'bi', 69 'bod': 'bo', 70 'bos': 'bs', 71 'bre': 'br', 72 'bul': 'bg', 73 'cat': 'ca', 74 'ces': 'cs', 75 'cha': 'ch', 76 'che': 'ce', 77 'chu': 'cu', 78 'chv': 'cv', 79 'cor': 'kw', 80 'cos': 'co', 81 'cre': 'cr', 82 'cym': 'cy', 83 'dan': 'da', 84 'deu': 'de', 85 'div': 'dv', 86 'dzo': 'dz', 87 'ell': 'el', 88 'eng': 'en', 89 'epo': 'eo', 90 'est': 'et', 91 'eus': 'eu', 92 'ewe': 'ee', 93 'fao': 'fo', 94 'fas': 'fa', 95 'fij': 'fj', 96 'fin': 'fi', 97 'fra': 'fr', 98 'fry': 'fy', 99 'ful': 'ff', 100 'gla': 'gd', 101 'gle': 'ga', 102 'glg': 'gl', 103 'glv': 'gv', 104 'grn': 'gn', 105 'guj': 'gu', 106 'hat': 'ht', 107 'hau': 'ha', 108 'hbs': 'sh', 109 'heb': 'he', 110 'her': 'hz', 111 'hin': 'hi', 112 'hmo': 'ho', 113 'hrv': 'hr', 114 'hun': 'hu', 115 'hye': 'hy', 116 'ibo': 'ig', 117 'ido': 'io', 118 'iii': 'ii', 119 'iku': 'iu', 120 'ile': 'ie', 121 'ina': 'ia', 122 'ind': 'id', 123 'ipk': 'ik', 124 'isl': 'is', 125 'ita': 'it', 126 'jav': 'jv', 127 'jpn': 'ja', 128 'kal': 'kl', 129 'kan': 'kn', 130 'kas': 'ks', 131 'kat': 'ka', 132 'kau': 'kr', 133 'kaz': 'kk', 134 'khm': 'km', 135 'kik': 'ki', 136 'kin': 'rw', 137 'kir': 'ky', 138 'kom': 'kv', 139 'kon': 'kg', 140 'kor': 'ko', 141 'kua': 'kj', 142 'kur': 'ku', 143 'lao': 'lo', 144 'lat': 'la', 145 'lav': 'lv', 146 'lim': 'li', 147 'lin': 'ln', 148 'lit': 'lt', 149 'ltz': 'lb', 150 'lub': 'lu', 151 'lug': 'lg', 152 'mah': 'mh', 153 'mal': 'ml', 154 'mar': 'mr', 155 'mkd': 'mk', 156 'mlg': 'mg', 157 'mlt': 'mt', 158 'mol': 'mo', 159 'mon': 'mn', 160 'mri': 'mi', 161 'msa': 'ms', 162 'mya': 'my', 163 'nau': 'na', 164 'nav': 'nv', 165 'nbl': 'nr', 166 'nde': 'nd', 167 'ndo': 'ng', 168 'nep': 'ne', 169 'nld': 'nl', 170 'nno': 'nn', 171 'nob': 'nb', 172 'nor': 'no', 173 'nya': 'ny', 174 'oci': 'oc', 175 'oji': 'oj', 176 'ori': 'or', 177 'orm': 'om', 178 'oss': 'os', 179 'pan': 'pa', 180 'pli': 'pi', 181 'pol': 'pl', 182 'por': 'pt', 183 'pus': 'ps', 184 'que': 'qu', 185 'roh': 'rm', 186 'ron': 'ro', 187 'run': 'rn', 188 'rus': 'ru', 189 'sag': 'sg', 190 'san': 'sa', 191 'sin': 'si', 192 'slk': 'sk', 193 'slv': 'sl', 194 'sme': 'se', 195 'smo': 'sm', 196 'sna': 'sn', 197 'snd': 'sd', 198 'som': 'so', 199 'sot': 'st', 200 'spa': 'es', 201 'sqi': 'sq', 202 'srd': 'sc', 203 'srp': 'sr', 204 'ssw': 'ss', 205 'sun': 'su', 206 'swa': 'sw', 207 'swe': 'sv', 208 'tah': 'ty', 209 'tam': 'ta', 210 'tat': 'tt', 211 'tel': 'te', 212 'tgk': 'tg', 213 'tgl': 'tl', 214 'tha': 'th', 215 'tir': 'ti', 216 'ton': 'to', 217 'tsn': 'tn', 218 'tso': 'ts', 219 'tuk': 'tk', 220 'tur': 'tr', 221 'twi': 'tw', 222 'uig': 'ug', 223 'ukr': 'uk', 224 'urd': 'ur', 225 'uzb': 'uz', 226 'ven': 've', 227 'vie': 'vi', 228 'vol': 'vo', 229 'wln': 'wa', 230 'wol': 'wo', 231 'xho': 'xh', 232 'yid': 'yi', 233 'yor': 'yo', 234 'zha': 'za', 235 'zho': 'zh', 236 'zul': 'zu', 237} 238 239class LanguageTag (object): 240 """A BCP 47 language tag. 241 242 Attributes: 243 subtags (List[str]): The list of subtags in this tag. 244 grandfathered (bool): Whether this tag is grandfathered. If 245 ``true``, the entire lowercased tag is the ``language`` 246 and the other subtag fields are empty. 247 language (str): The language subtag. 248 script (str): The script subtag. 249 region (str): The region subtag. 250 variant (str): The variant subtag. 251 252 Args: 253 tag (str): A BCP 47 language tag. 254 255 """ 256 def __init__ (self, tag): 257 global bcp_47 258 self.subtags = tag.lower ().split ('-') 259 self.grandfathered = tag.lower () in bcp_47.grandfathered 260 if self.grandfathered: 261 self.language = tag.lower () 262 self.script = '' 263 self.region = '' 264 self.variant = '' 265 else: 266 self.language = self.subtags[0] 267 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags) 268 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:]) 269 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags) 270 271 def __str__(self): 272 return '-'.join(self.subtags) 273 274 def __repr__ (self): 275 return 'LanguageTag(%r)' % str(self) 276 277 @staticmethod 278 def _find_first (function, sequence): 279 try: 280 return next (iter (filter (function, sequence))) 281 except StopIteration: 282 return None 283 284 def is_complex (self): 285 """Return whether this tag is too complex to represent as a 286 ``LangTag`` in the generated code. 287 288 Complex tags need to be handled in 289 ``hb_ot_tags_from_complex_language``. 290 291 Returns: 292 Whether this tag is complex. 293 """ 294 return not (len (self.subtags) == 1 295 or self.grandfathered 296 and len (self.subtags[1]) != 3 297 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language]) 298 299 def get_group (self): 300 """Return the group into which this tag should be categorized in 301 ``hb_ot_tags_from_complex_language``. 302 303 The group is the first letter of the tag, or ``'und'`` if this tag 304 should not be matched in a ``switch`` statement in the generated 305 code. 306 307 Returns: 308 This tag's group. 309 """ 310 return ('und' 311 if (self.language == 'und' 312 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1) 313 else self.language[0]) 314 315class OpenTypeRegistryParser (HTMLParser): 316 """A parser for the OpenType language system tag registry. 317 318 Attributes: 319 header (str): The "last updated" line of the registry. 320 names (Mapping[str, str]): A map of language system tags to the 321 names they are given in the registry. 322 ranks (DefaultDict[str, int]): A map of language system tags to 323 numbers. If a single BCP 47 tag corresponds to multiple 324 OpenType tags, the tags are ordered in increasing order by 325 rank. The rank is based on the number of BCP 47 tags 326 associated with a tag, though it may be manually modified. 327 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of 328 OpenType language system tags to sets of BCP 47 tags. 329 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` 330 inverted. Its values start as unsorted sets; 331 ``sort_languages`` converts them to sorted lists. 332 from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]): 333 A copy of ``from_bcp_47``. It starts as ``None`` and is 334 populated at the beginning of the first call to 335 ``inherit_from_macrolanguages``. 336 337 """ 338 def __init__ (self): 339 HTMLParser.__init__ (self) 340 self.header = '' 341 self.names = {} 342 self.ranks = collections.defaultdict (int) 343 self.to_bcp_47 = collections.defaultdict (set) 344 self.from_bcp_47 = collections.defaultdict (set) 345 self.from_bcp_47_uninherited = None 346 # Whether the parser is in a <td> element 347 self._td = False 348 # Whether the parser is after a <br> element within the current <tr> element 349 self._br = False 350 # The text of the <td> elements of the current <tr> element. 351 self._current_tr = [] 352 353 def handle_starttag (self, tag, attrs): 354 if tag == 'br': 355 self._br = True 356 elif tag == 'meta': 357 for attr, value in attrs: 358 if attr == 'name' and value == 'updated_at': 359 self.header = self.get_starttag_text () 360 break 361 elif tag == 'td': 362 self._td = True 363 self._current_tr.append ('') 364 elif tag == 'tr': 365 self._br = False 366 self._current_tr = [] 367 368 def handle_endtag (self, tag): 369 if tag == 'td': 370 self._td = False 371 elif tag == 'tr' and self._current_tr: 372 expect (2 <= len (self._current_tr) <= 3) 373 name = self._current_tr[0].strip () 374 tag = self._current_tr[1].strip ("\t\n\v\f\r '") 375 rank = 0 376 if len (tag) > 4: 377 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag) 378 name += ' (deprecated)' 379 tag = tag.split (' ')[0] 380 rank = 1 381 self.names[tag] = re.sub (' languages$', '', name) 382 if not self._current_tr[2]: 383 return 384 iso_codes = self._current_tr[2].strip () 385 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (',')) 386 rank += 2 * len (self.to_bcp_47[tag]) 387 self.ranks[tag] = rank 388 389 def handle_data (self, data): 390 if self._td and not self._br: 391 self._current_tr[-1] += data 392 393 def handle_charref (self, name): 394 self.handle_data (html.unescape ('&#%s;' % name)) 395 396 def handle_entityref (self, name): 397 self.handle_data (html.unescape ('&%s;' % name)) 398 399 def parse (self, filename): 400 """Parse the OpenType language system tag registry. 401 402 Args: 403 filename (str): The file name of the registry. 404 """ 405 with open (filename, encoding='utf-8') as f: 406 self.feed (f.read ()) 407 expect (self.header) 408 for tag, iso_codes in self.to_bcp_47.items (): 409 for iso_code in iso_codes: 410 self.from_bcp_47[iso_code].add (tag) 411 412 def add_language (self, bcp_47_tag, ot_tag): 413 """Add a language as if it were in the registry. 414 415 Args: 416 bcp_47_tag (str): A BCP 47 tag. If the tag is more than just 417 a language subtag, and if the language subtag is a 418 macrolanguage, then new languages are added corresponding 419 to the macrolanguages' individual languages with the 420 remainder of the tag appended. 421 ot_tag (str): An OpenType language system tag. 422 """ 423 global bcp_47 424 self.to_bcp_47[ot_tag].add (bcp_47_tag) 425 self.from_bcp_47[bcp_47_tag].add (ot_tag) 426 if bcp_47_tag.lower () not in bcp_47.grandfathered: 427 try: 428 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1) 429 if macrolanguage in bcp_47.macrolanguages: 430 s = set () 431 for language in bcp_47.macrolanguages[macrolanguage]: 432 if language.lower () not in bcp_47.grandfathered: 433 s.add ('%s-%s' % (language, suffix)) 434 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s 435 except ValueError: 436 pass 437 438 @staticmethod 439 def _remove_language (tag_1, dict_1, dict_2): 440 for tag_2 in dict_1.pop (tag_1): 441 dict_2[tag_2].remove (tag_1) 442 if not dict_2[tag_2]: 443 del dict_2[tag_2] 444 445 def remove_language_ot (self, ot_tag): 446 """Remove an OpenType tag from the registry. 447 448 Args: 449 ot_tag (str): An OpenType tag. 450 """ 451 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47) 452 453 def remove_language_bcp_47 (self, bcp_47_tag): 454 """Remove a BCP 47 tag from the registry. 455 456 Args: 457 bcp_47_tag (str): A BCP 47 tag. 458 """ 459 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47) 460 461 def inherit_from_macrolanguages (self): 462 """Copy mappings from macrolanguages to individual languages. 463 464 If a BCP 47 tag for an individual mapping has no OpenType 465 mapping but its macrolanguage does, the mapping is copied to 466 the individual language. For example, als (Tosk Albanian) has no 467 explicit mapping, so it inherits from sq (Albanian) the mapping 468 to SQI. 469 470 However, if an OpenType tag maps to a BCP 47 macrolanguage and 471 some but not all of its individual languages, the mapping is not 472 inherited from the macrolanguage to the missing individual 473 languages. For example, INUK (Nunavik Inuktitut) is mapped to 474 ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to 475 ikt (Inuinnaqtun, which is an individual language of iu), so 476 this method does not add a mapping from ikt to INUK. 477 478 If a BCP 47 tag for a macrolanguage has no OpenType mapping but 479 some of its individual languages do, their mappings are copied 480 to the macrolanguage. 481 """ 482 global bcp_47 483 first_time = self.from_bcp_47_uninherited is None 484 if first_time: 485 self.from_bcp_47_uninherited = dict (self.from_bcp_47) 486 for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): 487 ot_macrolanguages = { 488 ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ()) 489 } 490 blocked_ot_macrolanguages = set () 491 if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''): 492 for ot_macrolanguage in ot_macrolanguages: 493 round_trip_macrolanguages = { 494 l for l in self.to_bcp_47[ot_macrolanguage] 495 if 'retired code' not in bcp_47.scopes.get (l, '') 496 } 497 round_trip_languages = { 498 l for l in languages 499 if 'retired code' not in bcp_47.scopes.get (l, '') 500 } 501 intersection = round_trip_macrolanguages & round_trip_languages 502 if intersection and intersection != round_trip_languages: 503 blocked_ot_macrolanguages.add (ot_macrolanguage) 504 if ot_macrolanguages: 505 for ot_macrolanguage in ot_macrolanguages: 506 if ot_macrolanguage not in blocked_ot_macrolanguages: 507 for language in languages: 508 self.add_language (language, ot_macrolanguage) 509 if not blocked_ot_macrolanguages: 510 self.ranks[ot_macrolanguage] += 1 511 elif first_time: 512 for language in languages: 513 if language in self.from_bcp_47_uninherited: 514 ot_macrolanguages |= self.from_bcp_47_uninherited[language] 515 else: 516 ot_macrolanguages.clear () 517 if not ot_macrolanguages: 518 break 519 for ot_macrolanguage in ot_macrolanguages: 520 self.add_language (macrolanguage, ot_macrolanguage) 521 522 def sort_languages (self): 523 """Sort the values of ``from_bcp_47`` in ascending rank order.""" 524 for language, tags in self.from_bcp_47.items (): 525 self.from_bcp_47[language] = sorted (tags, 526 key=lambda t: (self.ranks[t] + rank_delta (language, t), t)) 527 528ot = OpenTypeRegistryParser () 529 530class BCP47Parser (object): 531 """A parser for the BCP 47 subtag registry. 532 533 Attributes: 534 header (str): The "File-Date" line of the registry. 535 names (Mapping[str, str]): A map of subtags to the names they 536 are given in the registry. Each value is a 537 ``'\\n'``-separated list of names. 538 scopes (Mapping[str, str]): A map of language subtags to strings 539 suffixed to language names, including suffixes to explain 540 language scopes. 541 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of 542 language subtags to the sets of language subtags which 543 inherit from them. See 544 ``OpenTypeRegistryParser.inherit_from_macrolanguages``. 545 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant 546 subtags to their prefixes. 547 grandfathered (AbstractSet[str]): The set of grandfathered tags, 548 normalized to lowercase. 549 550 """ 551 def __init__ (self): 552 self.header = '' 553 self.names = {} 554 self.scopes = {} 555 self.macrolanguages = collections.defaultdict (set) 556 self.prefixes = collections.defaultdict (set) 557 self.grandfathered = set () 558 559 def parse (self, filename): 560 """Parse the BCP 47 subtag registry. 561 562 Args: 563 filename (str): The file name of the registry. 564 """ 565 with open (filename, encoding='utf-8') as f: 566 subtag_type = None 567 subtag = None 568 deprecated = False 569 has_preferred_value = False 570 line_buffer = '' 571 for line in itertools.chain (f, ['']): 572 line = line.rstrip () 573 if line.startswith (' '): 574 line_buffer += line[1:] 575 continue 576 line, line_buffer = line_buffer, line 577 if line.startswith ('Type: '): 578 subtag_type = line.split (' ')[1] 579 deprecated = False 580 has_preferred_value = False 581 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '): 582 subtag = line.split (' ')[1] 583 if subtag_type == 'grandfathered': 584 self.grandfathered.add (subtag.lower ()) 585 elif line.startswith ('Description: '): 586 description = line.split (' ', 1)[1].replace (' (individual language)', '') 587 description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '', 588 description) 589 if subtag in self.names: 590 self.names[subtag] += '\n' + description 591 else: 592 self.names[subtag] = description 593 elif subtag_type == 'language' or subtag_type == 'grandfathered': 594 if line.startswith ('Scope: '): 595 scope = line.split (' ')[1] 596 if scope == 'macrolanguage': 597 scope = ' [macrolanguage]' 598 elif scope == 'collection': 599 scope = ' [collection]' 600 else: 601 continue 602 self.scopes[subtag] = scope 603 elif line.startswith ('Deprecated: '): 604 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 605 deprecated = True 606 elif deprecated and line.startswith ('Comments: see '): 607 # If a subtag is split into multiple replacement subtags, 608 # it essentially represents a macrolanguage. 609 for language in line.replace (',', '').split (' ')[2:]: 610 self._add_macrolanguage (subtag, language) 611 elif line.startswith ('Preferred-Value: '): 612 # If a subtag is deprecated in favor of a single replacement subtag, 613 # it is either a dialect or synonym of the preferred subtag. Either 614 # way, it is close enough to the truth to consider the replacement 615 # the macrolanguage of the deprecated language. 616 has_preferred_value = True 617 macrolanguage = line.split (' ')[1] 618 self._add_macrolanguage (macrolanguage, subtag) 619 elif not has_preferred_value and line.startswith ('Macrolanguage: '): 620 self._add_macrolanguage (line.split (' ')[1], subtag) 621 elif subtag_type == 'variant': 622 if line.startswith ('Deprecated: '): 623 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 624 elif line.startswith ('Prefix: '): 625 self.prefixes[subtag].add (line.split (' ')[1]) 626 elif line.startswith ('File-Date: '): 627 self.header = line 628 expect (self.header) 629 630 def _add_macrolanguage (self, macrolanguage, language): 631 global ot 632 if language not in ot.from_bcp_47: 633 for l in self.macrolanguages.get (language, set ()): 634 self._add_macrolanguage (macrolanguage, l) 635 if macrolanguage not in ot.from_bcp_47: 636 for ls in list (self.macrolanguages.values ()): 637 if macrolanguage in ls: 638 ls.add (language) 639 return 640 self.macrolanguages[macrolanguage].add (language) 641 642 def remove_extra_macrolanguages (self): 643 """Make every language have at most one macrolanguage.""" 644 inverted = collections.defaultdict (list) 645 for macrolanguage, languages in self.macrolanguages.items (): 646 for language in languages: 647 inverted[language].append (macrolanguage) 648 for language, macrolanguages in inverted.items (): 649 if len (macrolanguages) > 1: 650 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml])) 651 biggest_macrolanguage = macrolanguages.pop () 652 for macrolanguage in macrolanguages: 653 self._add_macrolanguage (biggest_macrolanguage, macrolanguage) 654 655 def _get_name_piece (self, subtag): 656 """Return the first name of a subtag plus its scope suffix. 657 658 Args: 659 subtag (str): A BCP 47 subtag. 660 661 Returns: 662 The name form of ``subtag``. 663 """ 664 return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '') 665 666 def get_name (self, lt): 667 """Return the names of the subtags in a language tag. 668 669 Args: 670 lt (LanguageTag): A BCP 47 language tag. 671 672 Returns: 673 The name form of ``lt``. 674 """ 675 name = self._get_name_piece (lt.language) 676 if lt.script: 677 name += '; ' + self._get_name_piece (lt.script.title ()) 678 if lt.region: 679 name += '; ' + self._get_name_piece (lt.region.upper ()) 680 if lt.variant: 681 name += '; ' + self._get_name_piece (lt.variant) 682 return name 683 684bcp_47 = BCP47Parser () 685 686ot.parse (sys.argv[1]) 687bcp_47.parse (sys.argv[2]) 688 689ot.add_language ('ary', 'MOR') 690 691ot.add_language ('ath', 'ATH') 692 693ot.add_language ('bai', 'BML') 694 695ot.ranks['BAL'] = ot.ranks['KAR'] + 1 696 697ot.add_language ('ber', 'BBR') 698 699ot.remove_language_ot ('PGR') 700ot.add_language ('el-polyton', 'PGR') 701 702bcp_47.macrolanguages['et'] = {'ekk'} 703 704bcp_47.names['flm'] = 'Falam Chin' 705bcp_47.scopes['flm'] = ' (retired code)' 706bcp_47.macrolanguages['flm'] = {'cfm'} 707 708ot.ranks['FNE'] = ot.ranks['TNE'] + 1 709 710ot.add_language ('und-fonipa', 'IPPH') 711 712ot.add_language ('und-fonnapa', 'APPH') 713 714ot.remove_language_ot ('IRT') 715ot.add_language ('ga-Latg', 'IRT') 716 717ot.add_language ('hy-arevmda', 'HYE') 718 719ot.remove_language_ot ('KGE') 720ot.add_language ('und-Geok', 'KGE') 721 722bcp_47.macrolanguages['id'] = {'in'} 723 724bcp_47.macrolanguages['ijo'] = {'ijc'} 725 726ot.add_language ('kht', 'KHN') 727ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)' 728ot.ranks['KHN'] = ot.ranks['KHT'] + 1 729 730ot.ranks['LCR'] = ot.ranks['MCR'] + 1 731 732ot.names['MAL'] = 'Malayalam Traditional' 733ot.ranks['MLR'] += 1 734 735bcp_47.names['mhv'] = 'Arakanese' 736bcp_47.scopes['mhv'] = ' (retired code)' 737 738ot.add_language ('mnw-TH', 'MONT') 739 740ot.add_language ('no', 'NOR') 741 742ot.add_language ('oc-provenc', 'PRO') 743 744ot.remove_language_ot ('QUZ') 745ot.add_language ('qu', 'QUZ') 746ot.add_language ('qub', 'QWH') 747ot.add_language ('qud', 'QVI') 748ot.add_language ('qug', 'QVI') 749ot.add_language ('qul', 'QUH') 750ot.add_language ('qup', 'QVI') 751ot.add_language ('qur', 'QWH') 752ot.add_language ('qus', 'QUH') 753ot.add_language ('quw', 'QVI') 754ot.add_language ('qux', 'QWH') 755ot.add_language ('qva', 'QWH') 756ot.add_language ('qvh', 'QWH') 757ot.add_language ('qvj', 'QVI') 758ot.add_language ('qvl', 'QWH') 759ot.add_language ('qvm', 'QWH') 760ot.add_language ('qvn', 'QWH') 761ot.add_language ('qvo', 'QVI') 762ot.add_language ('qvp', 'QWH') 763ot.add_language ('qvw', 'QWH') 764ot.add_language ('qvz', 'QVI') 765ot.add_language ('qwa', 'QWH') 766ot.add_language ('qws', 'QWH') 767ot.add_language ('qxa', 'QWH') 768ot.add_language ('qxc', 'QWH') 769ot.add_language ('qxh', 'QWH') 770ot.add_language ('qxl', 'QVI') 771ot.add_language ('qxn', 'QWH') 772ot.add_language ('qxo', 'QWH') 773ot.add_language ('qxr', 'QVI') 774ot.add_language ('qxt', 'QWH') 775ot.add_language ('qxw', 'QWH') 776 777bcp_47.macrolanguages['ro-MD'].add ('mo') 778 779ot.remove_language_ot ('SYRE') 780ot.remove_language_ot ('SYRJ') 781ot.remove_language_ot ('SYRN') 782ot.add_language ('und-Syre', 'SYRE') 783ot.add_language ('und-Syrj', 'SYRJ') 784ot.add_language ('und-Syrn', 'SYRN') 785 786bcp_47.names['xst'] = "Silt'e" 787bcp_47.scopes['xst'] = ' (retired code)' 788bcp_47.macrolanguages['xst'] = {'stv', 'wle'} 789 790ot.add_language ('xwo', 'TOD') 791 792ot.remove_language_ot ('ZHH') 793ot.remove_language_ot ('ZHP') 794ot.remove_language_ot ('ZHT') 795ot.remove_language_ot ('ZHTM') 796bcp_47.macrolanguages['zh'].remove ('lzh') 797bcp_47.macrolanguages['zh'].remove ('yue') 798ot.add_language ('zh-Hant-MO', 'ZHH') 799ot.add_language ('zh-Hant-MO', 'ZHTM') 800ot.add_language ('zh-Hant-HK', 'ZHH') 801ot.add_language ('zh-Hans', 'ZHS') 802ot.add_language ('zh-Hant', 'ZHT') 803ot.add_language ('zh-HK', 'ZHH') 804ot.add_language ('zh-MO', 'ZHH') 805ot.add_language ('zh-MO', 'ZHTM') 806ot.add_language ('zh-TW', 'ZHT') 807ot.add_language ('lzh', 'ZHT') 808ot.add_language ('lzh-Hans', 'ZHS') 809ot.add_language ('yue', 'ZHH') 810ot.add_language ('yue-Hans', 'ZHS') 811 812bcp_47.macrolanguages['zom'] = {'yos'} 813 814def rank_delta (bcp_47, ot): 815 """Return a delta to apply to a BCP 47 tag's rank. 816 817 Most OpenType tags have a constant rank, but a few have ranks that 818 depend on the BCP 47 tag. 819 820 Args: 821 bcp_47 (str): A BCP 47 tag. 822 ot (str): An OpenType tag to. 823 824 Returns: 825 A number to add to ``ot``'s rank when sorting ``bcp_47``'s 826 OpenType equivalents. 827 """ 828 if bcp_47 == 'ak' and ot == 'AKA': 829 return -1 830 if bcp_47 == 'tw' and ot == 'TWI': 831 return -1 832 return 0 833 834disambiguation = { 835 'ALT': 'alt', 836 'ARK': 'rki', 837 'ATH': 'ath', 838 'BHI': 'bhb', 839 'BLN': 'bjt', 840 'BTI': 'beb', 841 'CCHN': 'cco', 842 'CMR': 'swb', 843 'CPP': 'crp', 844 'CRR': 'crx', 845 'DUJ': 'dwu', 846 'ECR': 'crj', 847 'HAL': 'cfm', 848 'HND': 'hnd', 849 'HYE': 'hyw', 850 'KIS': 'kqs', 851 'KUI': 'uki', 852 'LRC': 'bqi', 853 'NDB': 'nd', 854 'NIS': 'njz', 855 'PLG': 'pce', 856 'PRO': 'pro', 857 'QIN': 'bgr', 858 'QUH': 'quh', 859 'QVI': 'qvi', 860 'QWH': 'qwh', 861 'SIG': 'stv', 862 'SRB': 'sr', 863 'SXT': 'xnj', 864 'ZHH': 'zh-HK', 865 'ZHS': 'zh-Hans', 866 'ZHT': 'zh-Hant', 867 'ZHTM': 'zh-MO', 868} 869 870ot.inherit_from_macrolanguages () 871bcp_47.remove_extra_macrolanguages () 872ot.inherit_from_macrolanguages () 873ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/' 874ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1 875for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names): 876 possible_bcp_47_tag = tricky_ot_tag.lower () 877 if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]: 878 ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM) 879 bcp_47.macrolanguages[possible_bcp_47_tag] = set () 880ot.sort_languages () 881 882print ('/* == Start of generated table == */') 883print ('/*') 884print (' * The following table is generated by running:') 885print (' *') 886print (' * %s languagetags language-subtag-registry' % sys.argv[0]) 887print (' *') 888print (' * on files with these headers:') 889print (' *') 890print (' * %s' % ot.header.strip ()) 891print (' * %s' % bcp_47.header) 892print (' */') 893print () 894print ('#ifndef HB_OT_TAG_TABLE_HH') 895print ('#define HB_OT_TAG_TABLE_HH') 896print () 897print ('static const LangTag ot_languages[] = {') 898 899def hb_tag (tag): 900 """Convert a tag to ``HB_TAG`` form. 901 902 Args: 903 tag (str): An OpenType tag. 904 905 Returns: 906 A snippet of C++ representing ``tag``. 907 """ 908 if tag == DEFAULT_LANGUAGE_SYSTEM: 909 return 'HB_TAG_NONE\t ' 910 return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4]) 911 912def get_variant_set (name): 913 """Return a set of variant language names from a name. 914 915 Args: 916 name (str): A list of language names from the BCP 47 registry, 917 joined on ``'\\n'``. 918 919 Returns: 920 A set of normalized language names. 921 """ 922 return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'")) 923 .encode ('ASCII', 'ignore') 924 .strip () 925 for n in re.split ('[\n(),]', name) if n) 926 927def language_name_intersection (a, b): 928 """Return the names in common between two language names. 929 930 Args: 931 a (str): A list of language names from the BCP 47 registry, 932 joined on ``'\\n'``. 933 b (str): A list of language names from the BCP 47 registry, 934 joined on ``'\\n'``. 935 936 Returns: 937 The normalized language names shared by ``a`` and ``b``. 938 """ 939 return get_variant_set (a).intersection (get_variant_set (b)) 940 941def get_matching_language_name (intersection, candidates): 942 return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))) 943 944def same_tag (bcp_47_tag, ot_tags): 945 return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () 946 947for language, tags in sorted (ot.from_bcp_47.items ()): 948 if language == '' or '-' in language: 949 continue 950 commented_out = same_tag (language, tags) 951 for i, tag in enumerate (tags, start=1): 952 print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='') 953 if commented_out: 954 print ('*/', end='') 955 print ('\t/* ', end='') 956 bcp_47_name = bcp_47.names.get (language, '') 957 bcp_47_name_candidates = bcp_47_name.split ('\n') 958 ot_name = ot.names[tag] 959 scope = bcp_47.scopes.get (language, '') 960 if tag == DEFAULT_LANGUAGE_SYSTEM: 961 write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}') 962 else: 963 intersection = language_name_intersection (bcp_47_name, ot_name) 964 if not intersection: 965 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name)) 966 else: 967 name = get_matching_language_name (intersection, bcp_47_name_candidates) 968 bcp_47.names[language] = name 969 write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) 970 print (' */') 971 972print ('};') 973print () 974 975print ('/**') 976print (' * hb_ot_tags_from_complex_language:') 977print (' * @lang_str: a BCP 47 language tag to convert.') 978print (' * @limit: a pointer to the end of the substring of @lang_str to consider for') 979print (' * conversion.') 980print (' * @count: maximum number of language tags to retrieve (IN) and actual number of') 981print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.') 982print (' * @tags: array of size at least @language_count to store the language tag') 983print (' * results') 984print (' *') 985print (' * Converts a multi-subtag BCP 47 language tag to language tags.') 986print (' *') 987print (' * Return value: Whether any language systems were retrieved.') 988print (' **/') 989print ('static bool') 990print ('hb_ot_tags_from_complex_language (const char *lang_str,') 991print ('\t\t\t\t const char *limit,') 992print ('\t\t\t\t unsigned int *count /* IN/OUT */,') 993print ('\t\t\t\t hb_tag_t *tags /* OUT */)') 994print ('{') 995 996def print_subtag_matches (subtag, new_line): 997 if subtag: 998 if new_line: 999 print () 1000 print ('\t&& ', end='') 1001 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='') 1002 1003complex_tags = collections.defaultdict (list) 1004for initial, group in itertools.groupby ((lt_tags for lt_tags in [ 1005 (LanguageTag (language), tags) 1006 for language, tags in sorted (ot.from_bcp_47.items (), 1007 key=lambda i: (-len (i[0]), i[0])) 1008 ] if lt_tags[0].is_complex ()), 1009 key=lambda lt_tags: lt_tags[0].get_group ()): 1010 complex_tags[initial] += group 1011 1012for initial, items in sorted (complex_tags.items ()): 1013 if initial != 'und': 1014 continue 1015 for lt, tags in items: 1016 if not tags: 1017 continue 1018 if lt.variant in bcp_47.prefixes: 1019 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, 1020 '%s is not a valid prefix of %s' % (lt.language, lt.variant)) 1021 print (' if (', end='') 1022 print_subtag_matches (lt.script, False) 1023 print_subtag_matches (lt.region, False) 1024 print_subtag_matches (lt.variant, False) 1025 print (')') 1026 print (' {') 1027 write (' /* %s */' % bcp_47.get_name (lt)) 1028 print () 1029 if len (tags) == 1: 1030 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1031 print () 1032 print (' *count = 1;') 1033 else: 1034 print (' hb_tag_t possible_tags[] = {') 1035 for tag in tags: 1036 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1037 print () 1038 print (' };') 1039 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1040 print (' tags[i] = possible_tags[i];') 1041 print (' *count = i;') 1042 print (' return true;') 1043 print (' }') 1044 1045print (' switch (lang_str[0])') 1046print (' {') 1047for initial, items in sorted (complex_tags.items ()): 1048 if initial == 'und': 1049 continue 1050 print (" case '%s':" % initial) 1051 for lt, tags in items: 1052 if not tags: 1053 continue 1054 print (' if (', end='') 1055 script = lt.script 1056 region = lt.region 1057 if lt.grandfathered: 1058 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='') 1059 else: 1060 string_literal = lt.language[1:] + '-' 1061 if script: 1062 string_literal += script 1063 script = None 1064 if region: 1065 string_literal += '-' + region 1066 region = None 1067 if string_literal[-1] == '-': 1068 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='') 1069 else: 1070 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='') 1071 print_subtag_matches (script, True) 1072 print_subtag_matches (region, True) 1073 print_subtag_matches (lt.variant, True) 1074 print (')') 1075 print (' {') 1076 write (' /* %s */' % bcp_47.get_name (lt)) 1077 print () 1078 if len (tags) == 1: 1079 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1080 print () 1081 print (' *count = 1;') 1082 else: 1083 print (' unsigned int i;') 1084 print (' hb_tag_t possible_tags[] = {') 1085 for tag in tags: 1086 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1087 print () 1088 print (' };') 1089 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1090 print ('\ttags[i] = possible_tags[i];') 1091 print (' *count = i;') 1092 print (' return true;') 1093 print (' }') 1094 print (' break;') 1095 1096print (' }') 1097print (' return false;') 1098print ('}') 1099print () 1100print ('/**') 1101print (' * hb_ot_ambiguous_tag_to_language') 1102print (' * @tag: A language tag.') 1103print (' *') 1104print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to') 1105print (' * many language tags) and the best tag is not the alphabetically first, or if') 1106print (' * the best tag consists of multiple subtags, or if the best tag does not appear') 1107print (' * in #ot_languages.') 1108print (' *') 1109print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,') 1110print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.') 1111print (' **/') 1112print ('static hb_language_t') 1113print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)') 1114print ('{') 1115print (' switch (tag)') 1116print (' {') 1117 1118def verify_disambiguation_dict (): 1119 """Verify and normalize ``disambiguation``. 1120 1121 ``disambiguation`` is a map of ambiguous OpenType language system 1122 tags to the particular BCP 47 tags they correspond to. This function 1123 checks that all its keys really are ambiguous and that each key's 1124 value is valid for that key. It checks that no ambiguous tag is 1125 missing, except when it can figure out which BCP 47 tag is the best 1126 by itself. 1127 1128 It modifies ``disambiguation`` to remove keys whose values are the 1129 same as those that the fallback would return anyway, and to add 1130 ambiguous keys whose disambiguations it determined automatically. 1131 1132 Raises: 1133 AssertionError: Verification failed. 1134 """ 1135 global bcp_47 1136 global disambiguation 1137 global ot 1138 for ot_tag, bcp_47_tags in ot.to_bcp_47.items (): 1139 if ot_tag == DEFAULT_LANGUAGE_SYSTEM: 1140 primary_tags = [] 1141 else: 1142 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag) 1143 if len (primary_tags) == 1: 1144 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag) 1145 if '-' in primary_tags[0]: 1146 disambiguation[ot_tag] = primary_tags[0] 1147 else: 1148 first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0] 1149 if primary_tags[0] != first_tag: 1150 disambiguation[ot_tag] = primary_tags[0] 1151 elif len (primary_tags) == 0: 1152 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) 1153 else: 1154 original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] 1155 if len (original_languages) == 1: 1156 macrolanguages = original_languages 1157 else: 1158 macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] 1159 if len (macrolanguages) != 1: 1160 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]') 1161 if len (macrolanguages) != 1: 1162 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')) 1163 if len (macrolanguages) != 1: 1164 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages))) 1165 expect (disambiguation[ot_tag] in bcp_47_tags, 1166 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag)) 1167 elif ot_tag not in disambiguation: 1168 disambiguation[ot_tag] = macrolanguages[0] 1169 different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t))) 1170 if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]: 1171 del disambiguation[ot_tag] 1172 for ot_tag in disambiguation.keys (): 1173 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag) 1174 1175verify_disambiguation_dict () 1176for ot_tag, bcp_47_tag in sorted (disambiguation.items ()): 1177 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])) 1178 print () 1179 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag)))) 1180 print () 1181 1182print (' default:') 1183print (' return HB_LANGUAGE_INVALID;') 1184print (' }') 1185print ('}') 1186 1187print () 1188print ('#endif /* HB_OT_TAG_TABLE_HH */') 1189print () 1190print ('/* == End of generated table == */') 1191 1192