#!/usr/bin/env python3

"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
versa.

It creates a ``const LangTag[]``, matching the tags from the OpenType
languages system tag list to the language subtags of the BCP 47 language
subtag registry, with some manual adjustments. The mappings are
supplemented with macrolanguages' sublanguages and retired codes'
replacements, according to BCP 47 and some manual additions where BCP 47
omits a retired code entirely.

Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
multiple BCP 47 tags) are listed here, except when the alphabetically
first BCP 47 tag happens to be the chosen disambiguated tag. In that
case, the fallback behavior will choose the right tag anyway.

usage: ./gen-tag-table.py languagetags language-subtag-registry

Input files:
* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
"""

import collections
from html import unescape
from html.parser import HTMLParser
import itertools
import re
import sys
import unicodedata

# The script takes exactly two arguments; otherwise print the usage text
# (the module docstring) and exit.
if len (sys.argv) != 3:
    sys.exit (__doc__)

def write (s):
    """Write a string to standard output encoded as UTF-8.

    The text layer of ``sys.stdout`` is flushed first so that anything
    already emitted with ``print`` stays ahead of the bytes written
    directly to the underlying binary buffer.

    Args:
        s (str): The text to write. No newline is appended.
    """
    sys.stdout.flush ()
    sys.stdout.buffer.write (s.encode ('utf-8'))

def html_unescape (parser, entity):
    """Convert an HTML character or entity reference to text.

    Args:
        parser (HTMLParser): Unused; kept for the callers' signature.
        entity (str): A reference such as ``'&amp;'`` or ``'&#38;'``.

    Returns:
        The unescaped text.
    """
    return unescape (entity)

def expect (condition, message=None):
    """Raise ``AssertionError`` unless ``condition`` is truthy.

    Args:
        condition: The value to test.
        message (str): An optional message for the exception.

    Raises:
        AssertionError: ``condition`` is falsy.
    """
    if not condition:
        if message is None:
            raise AssertionError
        raise AssertionError (message)

# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
    'aar': 'aa',
    'abk': 'ab',
    'afr': 'af',
    'aka': 'ak',
    'amh': 'am',
    'ara': 'ar',
    'arg': 'an',
    'asm': 'as',
    'ava': 'av',
    'ave': 'ae',
    'aym': 'ay',
    'aze': 'az',
    'bak': 'ba',
    'bam': 'bm',
    'bel': 'be',
    'ben': 'bn',
    'bis': 'bi',
    'bod': 'bo',
    'bos': 'bs',
    'bre': 'br',
    'bul': 'bg',
    'cat': 'ca',
    'ces': 'cs',
    'cha': 'ch',
    'che': 'ce',
    'chu': 'cu',
    'chv': 'cv',
    'cor': 'kw',
    'cos': 'co',
    'cre': 'cr',
    'cym': 'cy',
    'dan': 'da',
    'deu': 'de',
    'div': 'dv',
    'dzo': 'dz',
    'ell': 'el',
    'eng': 'en',
    'epo': 'eo',
    'est': 'et',
    'eus': 'eu',
    'ewe': 'ee',
    'fao': 'fo',
    'fas': 'fa',
    'fij': 'fj',
    'fin': 'fi',
    'fra': 'fr',
    'fry': 'fy',
    'ful': 'ff',
    'gla': 'gd',
    'gle': 'ga',
    'glg': 'gl',
    'glv': 'gv',
    'grn': 'gn',
    'guj': 'gu',
    'hat': 'ht',
    'hau': 'ha',
    'hbs': 'sh',
    'heb': 'he',
    'her': 'hz',
    'hin': 'hi',
    'hmo': 'ho',
    'hrv': 'hr',
    'hun': 'hu',
    'hye': 'hy',
    'ibo': 'ig',
    'ido': 'io',
    'iii': 'ii',
    'iku': 'iu',
    'ile': 'ie',
    'ina': 'ia',
    'ind': 'id',
    'ipk': 'ik',
    'isl': 'is',
    'ita': 'it',
    'jav': 'jv',
    'jpn': 'ja',
    'kal': 'kl',
    'kan': 'kn',
    'kas': 'ks',
    'kat': 'ka',
    'kau': 'kr',
    'kaz': 'kk',
    'khm': 'km',
    'kik': 'ki',
    'kin': 'rw',
    'kir': 'ky',
    'kom': 'kv',
    'kon': 'kg',
    'kor': 'ko',
    'kua': 'kj',
    'kur': 'ku',
    'lao': 'lo',
    'lat': 'la',
    'lav': 'lv',
    'lim': 'li',
    'lin': 'ln',
    'lit': 'lt',
    'ltz': 'lb',
    'lub': 'lu',
    'lug': 'lg',
    'mah': 'mh',
    'mal': 'ml',
    'mar': 'mr',
    'mkd': 'mk',
    'mlg': 'mg',
    'mlt': 'mt',
    'mol': 'mo',
    'mon': 'mn',
    'mri': 'mi',
    'msa': 'ms',
    'mya': 'my',
    'nau': 'na',
    'nav': 'nv',
    'nbl': 'nr',
    'nde': 'nd',
    'ndo': 'ng',
    'nep': 'ne',
    'nld': 'nl',
    'nno': 'nn',
    'nob': 'nb',
    'nor': 'no',
    'nya': 'ny',
    'oci': 'oc',
    'oji': 'oj',
    'ori': 'or',
    'orm': 'om',
    'oss': 'os',
    'pan': 'pa',
    'pli': 'pi',
    'pol': 'pl',
    'por': 'pt',
    'pus': 'ps',
    'que': 'qu',
    'roh': 'rm',
    'ron': 'ro',
    'run': 'rn',
    'rus': 'ru',
    'sag': 'sg',
    'san': 'sa',
    'sin': 'si',
    'slk': 'sk',
    'slv': 'sl',
    'sme': 'se',
    'smo': 'sm',
    'sna': 'sn',
    'snd': 'sd',
    'som': 'so',
    'sot': 'st',
    'spa': 'es',
    'sqi': 'sq',
    'srd': 'sc',
    'srp': 'sr',
    'ssw': 'ss',
    'sun': 'su',
    'swa': 'sw',
    'swe': 'sv',
    'tah': 'ty',
    'tam': 'ta',
    'tat': 'tt',
    'tel': 'te',
    'tgk': 'tg',
    'tgl': 'tl',
    'tha': 'th',
    'tir': 'ti',
    'ton': 'to',
    'tsn': 'tn',
    'tso': 'ts',
    'tuk': 'tk',
    'tur': 'tr',
    'twi': 'tw',
    'uig': 'ug',
    'ukr': 'uk',
    'urd': 'ur',
    'uzb': 'uz',
    'ven': 've',
    'vie': 'vi',
    'vol': 'vo',
    'wln': 'wa',
    'wol': 'wo',
    'xho': 'xh',
    'yid': 'yi',
    'yor': 'yo',
    'zha': 'za',
    'zho': 'zh',
    'zul': 'zu',
}

class LanguageTag (object):
    """A BCP 47 language tag.

    Attributes:
        subtags (List[str]): The list of subtags in this tag.
        grandfathered (bool): Whether this tag is grandfathered. If
            ``true``, the entire lowercased tag is the ``language``
            and the other subtag fields are empty.
        language (str): The language subtag.
        script (str): The script subtag, or ``None`` if a
            non-grandfathered tag has none.
        region (str): The region subtag, or ``None`` if a
            non-grandfathered tag has none.
        variant (str): The variant subtag, or ``None`` if a
            non-grandfathered tag has none.

    Args:
        tag (str): A BCP 47 language tag.

    """
    def __init__ (self, tag):
        global bcp_47
        self.subtags = tag.lower ().split ('-')
        self.grandfathered = tag.lower () in bcp_47.grandfathered
        if self.grandfathered:
            self.language = tag.lower ()
            self.script = ''
            self.region = ''
            self.variant = ''
        else:
            self.language = self.subtags[0]
            # Subtags are classified by length and by whether the first
            # character sorts after the digits (``> '9'``).
            self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
            self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
            self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)

    def __str__(self):
        return '-'.join(self.subtags)

    def __repr__ (self):
        return 'LanguageTag(%r)' % str(self)

    @staticmethod
    def _find_first (function, sequence):
        """Return the first item of ``sequence`` for which ``function``
        returns a truthy value, or ``None`` if there is no such item.
        """
        return next (filter (function, sequence), None)

    def is_complex (self):
        """Return whether this tag is too complex to represent as a
        ``LangTag`` in the generated code.

        Complex tags need to be handled in
        ``hb_ot_tags_from_complex_language``.

        Returns:
            Whether this tag is complex.
        """
        return not (len (self.subtags) == 1
            or self.grandfathered
            and len (self.subtags[1]) != 3
            and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])

    def get_group (self):
        """Return the group into which this tag should be categorized in
        ``hb_ot_tags_from_complex_language``.

        The group is the first letter of the tag, or ``'und'`` if this tag
        should not be matched in a ``switch`` statement in the generated
        code.

        Returns:
            This tag's group.
        """
        return ('und'
            if (self.language == 'und'
                or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
            else self.language[0])

class OpenTypeRegistryParser (HTMLParser):
    """A parser for the OpenType language system tag registry.

    Attributes:
        header (str): The "last updated" line of the registry.
        names (Mapping[str, str]): A map of language system tags to the
            names they are given in the registry.
        ranks (DefaultDict[str, int]): A map of language system tags to
            numbers. If a single BCP 47 tag corresponds to multiple
            OpenType tags, the tags are ordered in increasing order by
            rank. The rank is based on the number of BCP 47 tags
            associated with a tag, though it may be manually modified.
        to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
            OpenType language system tags to sets of BCP 47 tags.
        from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
            inverted. Its values start as unsorted sets;
            ``sort_languages`` converts them to sorted lists.

    """
    def __init__ (self):
        HTMLParser.__init__ (self)
        self.header = ''
        self.names = {}
        self.ranks = collections.defaultdict (int)
        self.to_bcp_47 = collections.defaultdict (set)
        self.from_bcp_47 = collections.defaultdict (set)
        # Whether the parser is in a <td> element
        self._td = False
        # The text of the <td> elements of the current <tr> element.
        self._current_tr = []

    def handle_starttag (self, tag, attrs):
        """Track table structure and capture the ``updated_at`` meta tag."""
        if tag == 'meta':
            for attr, value in attrs:
                if attr == 'name' and value == 'updated_at':
                    self.header = self.get_starttag_text ()
                    break
        elif tag == 'td':
            self._td = True
            self._current_tr.append ('')
        elif tag == 'tr':
            self._current_tr = []

    def handle_endtag (self, tag):
        """Record the mapping described by a finished table row.

        A row has a name cell, an OpenType tag cell and optionally an
        ISO codes cell. Rows without ISO codes only contribute a name.
        """
        if tag == 'td':
            self._td = False
        elif tag == 'tr' and self._current_tr:
            expect (2 <= len (self._current_tr) <= 3)
            name = self._current_tr[0].strip ()
            tag = self._current_tr[1].strip ("\t\n\v\f\r '")
            rank = 0
            if len (tag) > 4:
                expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
                name += ' (deprecated)'
                tag = tag.split (' ')[0]
                rank = 1
            self.names[tag] = re.sub (' languages$', '', name)
            # A two-cell row has no ISO column at all; treat it like a
            # row whose ISO cell is empty instead of raising IndexError.
            if len (self._current_tr) < 3 or not self._current_tr[2]:
                return
            iso_codes = self._current_tr[2].strip ()
            self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
            rank += 2 * len (self.to_bcp_47[tag])
            self.ranks[tag] = rank

    def handle_data (self, data):
        """Append text to the current cell when inside a ``<td>``."""
        if self._td:
            self._current_tr[-1] += data

    def handle_charref (self, name):
        """Treat a numeric character reference as ordinary cell text."""
        self.handle_data (html_unescape (self, '&#%s;' % name))

    def handle_entityref (self, name):
        """Treat a named entity reference as ordinary cell text."""
        self.handle_data (html_unescape (self, '&%s;' % name))

    def parse (self, filename):
        """Parse the OpenType language system tag registry.

        Args:
            filename (str): The file name of the registry.
        """
        with open (filename, encoding='utf-8') as f:
            self.feed (f.read ())
        expect (self.header)
        # Build the inverse mapping once the whole registry is read.
        for tag, iso_codes in self.to_bcp_47.items ():
            for iso_code in iso_codes:
                self.from_bcp_47[iso_code].add (tag)

    def add_language (self, bcp_47_tag, ot_tag):
        """Add a language as if it were in the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
                a language subtag, and if the language subtag is a
                macrolanguage, then new languages are added corresponding
                to the macrolanguages' individual languages with the
                remainder of the tag appended.
            ot_tag (str): An OpenType language system tag.
        """
        global bcp_47
        self.to_bcp_47[ot_tag].add (bcp_47_tag)
        self.from_bcp_47[bcp_47_tag].add (ot_tag)
        if bcp_47_tag.lower () not in bcp_47.grandfathered:
            try:
                [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
            except ValueError:
                # The tag is a bare language subtag; nothing to expand.
                return
            if macrolanguage in bcp_47.macrolanguages:
                s = set ()
                for language in bcp_47.macrolanguages[macrolanguage]:
                    if language.lower () not in bcp_47.grandfathered:
                        s.add ('%s-%s' % (language, suffix))
                bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s

    @staticmethod
    def _remove_language (tag_1, dict_1, dict_2):
        """Remove ``tag_1`` from ``dict_1`` and from every set in
        ``dict_2`` that referred to it, dropping sets that become empty.
        """
        for tag_2 in dict_1.pop (tag_1):
            dict_2[tag_2].remove (tag_1)
            if not dict_2[tag_2]:
                del dict_2[tag_2]

    def remove_language_ot (self, ot_tag):
        """Remove an OpenType tag from the registry.

        Args:
            ot_tag (str): An OpenType tag.
        """
        self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

    def remove_language_bcp_47 (self, bcp_47_tag):
        """Remove a BCP 47 tag from the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag.
        """
        self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

    def inherit_from_macrolanguages (self):
        """Copy mappings from macrolanguages to individual languages.

        If a BCP 47 tag for an individual mapping has no OpenType
        mapping but its macrolanguage does, the mapping is copied to
        the individual language. For example, als (Tosk Albanian) has no
        explicit mapping, so it inherits from sq (Albanian) the mapping
        to SQI.

        If a BCP 47 tag for a macrolanguage has no OpenType mapping but
        all of its individual languages do and they all map to the same
        tags, the mapping is copied to the macrolanguage.
        """
        global bcp_47
        # Decisions are based on a snapshot of the keys present before
        # any inheritance happens in this pass.
        original_ot_from_bcp_47 = dict (self.from_bcp_47)
        for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
            ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
            if ot_macrolanguages:
                for ot_macrolanguage in ot_macrolanguages:
                    for language in languages:
                        # Remove the following condition if e.g. nn should map to NYN,NOR
                        # instead of just NYN.
                        if language not in original_ot_from_bcp_47:
                            self.add_language (language, ot_macrolanguage)
                            self.ranks[ot_macrolanguage] += 1
            else:
                # Intersect the individual languages' mappings; give up
                # as soon as the intersection is empty.
                for language in languages:
                    if language in original_ot_from_bcp_47:
                        if ot_macrolanguages:
                            ml = original_ot_from_bcp_47[language]
                            if ml:
                                ot_macrolanguages &= ml
                        else:
                            ot_macrolanguages |= original_ot_from_bcp_47[language]
                    else:
                        ot_macrolanguages.clear ()
                    if not ot_macrolanguages:
                        break
                for ot_macrolanguage in ot_macrolanguages:
                    self.add_language (macrolanguage, ot_macrolanguage)

    def sort_languages (self):
        """Sort the values of ``from_bcp_47`` in ascending rank order."""
        for language, tags in self.from_bcp_47.items ():
            self.from_bcp_47[language] = sorted (tags,
                    key=lambda t: (self.ranks[t] + rank_delta (language, t), t))

ot = OpenTypeRegistryParser ()

class BCP47Parser (object):
    """A parser for the BCP 47 subtag registry.

    Attributes:
        header (str): The "File-Date" line of the registry.
        names (Mapping[str, str]): A map of subtags to the names they
            are given in the registry. Each value is a
            ``'\\n'``-separated list of names.
        scopes (Mapping[str, str]): A map of language subtags to strings
            suffixed to language names, including suffixes to explain
            language scopes.
        macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
            language subtags to the sets of language subtags which
            inherit from them. See
            ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
        prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
            subtags to their prefixes.
        grandfathered (AbstractSet[str]): The set of grandfathered tags,
            normalized to lowercase.

    """
    def __init__ (self):
        self.header = ''
        self.names = {}
        self.scopes = {}
        self.macrolanguages = collections.defaultdict (set)
        self.prefixes = collections.defaultdict (set)
        self.grandfathered = set ()

    def parse (self, filename):
        """Parse the BCP 47 subtag registry.

        Args:
            filename (str): The file name of the registry.
        """
        with open (filename, encoding='utf-8') as f:
            subtag_type = None
            subtag = None
            deprecated = False
            has_preferred_value = False
            line_buffer = ''
            # The extra empty string makes the loop process the last
            # buffered logical line of the file.
            for line in itertools.chain (f, ['']):
                line = line.rstrip ()
                if line.startswith (' '):
                    # Continuation of the previous logical line.
                    line_buffer += line[1:]
                    continue
                # Process the completed logical line and start buffering
                # the one that was just read.
                line, line_buffer = line_buffer, line
                if line.startswith ('Type: '):
                    subtag_type = line.split (' ')[1]
                    deprecated = False
                    has_preferred_value = False
                elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
                    subtag = line.split (' ')[1]
                    if subtag_type == 'grandfathered':
                        self.grandfathered.add (subtag.lower ())
                elif line.startswith ('Description: '):
                    description = line.split (' ', 1)[1].replace (' (individual language)', '')
                    description = re.sub (r' (\((individual |macro)language\)|languages)$', '',
                            description)
                    if subtag in self.names:
                        self.names[subtag] += '\n' + description
                    else:
                        self.names[subtag] = description
                elif subtag_type == 'language' or subtag_type == 'grandfathered':
                    if line.startswith ('Scope: '):
                        scope = line.split (' ')[1]
                        if scope == 'macrolanguage':
                            scope = ' [macrolanguage]'
                        elif scope == 'collection':
                            scope = ' [family]'
                        else:
                            continue
                        self.scopes[subtag] = scope
                    elif line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                        deprecated = True
                    elif deprecated and line.startswith ('Comments: see '):
                        # If a subtag is split into multiple replacement subtags,
                        # it essentially represents a macrolanguage.
                        for language in line.replace (',', '').split (' ')[2:]:
                            self._add_macrolanguage (subtag, language)
                    elif line.startswith ('Preferred-Value: '):
                        # If a subtag is deprecated in favor of a single replacement subtag,
                        # it is either a dialect or synonym of the preferred subtag. Either
                        # way, it is close enough to the truth to consider the replacement
                        # the macrolanguage of the deprecated language.
                        has_preferred_value = True
                        macrolanguage = line.split (' ')[1]
                        self._add_macrolanguage (macrolanguage, subtag)
                    elif not has_preferred_value and line.startswith ('Macrolanguage: '):
                        self._add_macrolanguage (line.split (' ')[1], subtag)
                elif subtag_type == 'variant':
                    if line.startswith ('Prefix: '):
                        self.prefixes[subtag].add (line.split (' ')[1])
                elif line.startswith ('File-Date: '):
                    self.header = line
        expect (self.header)

    def _add_macrolanguage (self, macrolanguage, language):
        """Record ``language`` as inheriting from ``macrolanguage``.

        If ``language`` has no OpenType mapping, the languages recorded
        under it are first re-recorded under ``macrolanguage``. If
        ``macrolanguage`` has no OpenType mapping and is itself listed
        under other macrolanguages, ``language`` is added to those
        instead.
        """
        global ot
        if language not in ot.from_bcp_47:
            for l in self.macrolanguages.get (language, set ()):
                self._add_macrolanguage (macrolanguage, l)
        if macrolanguage not in ot.from_bcp_47:
            for ls in list (self.macrolanguages.values ()):
                if macrolanguage in ls:
                    ls.add (language)
                    return
        self.macrolanguages[macrolanguage].add (language)

    def remove_extra_macrolanguages (self):
        """Make every language have at most one macrolanguage."""
        inverted = collections.defaultdict (list)
        for macrolanguage, languages in self.macrolanguages.items ():
            for language in languages:
                inverted[language].append (macrolanguage)
        for language, macrolanguages in inverted.items ():
            if len (macrolanguages) > 1:
                macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
                biggest_macrolanguage = macrolanguages.pop ()
                for macrolanguage in macrolanguages:
                    self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

    def get_name (self, lt):
        """Return the names of the subtags in a language tag.

        Args:
            lt (LanguageTag): A BCP 47 language tag.

        Returns:
            The name form of ``lt``.
        """
        name = self.names[lt.language].split ('\n')[0]
        if lt.script:
            name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
        if lt.region:
            name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
        if lt.variant:
            name += '; ' + self.names[lt.variant].split ('\n')[0]
        return name

bcp_47 = BCP47Parser ()

ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Manual adjustments to the parsed registries.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}

def rank_delta (bcp_47, ot):
    """Return a delta to apply to a BCP 47 tag's rank.

    Most OpenType tags have a constant rank, but a few have ranks that
    depend on the BCP 47 tag.

    Note that the parameters shadow the module-level ``bcp_47`` and
    ``ot`` objects inside this function; here they are plain strings.

    Args:
        bcp_47 (str): A BCP 47 tag.
        ot (str): An OpenType tag.

    Returns:
        A number to add to ``ot``'s rank when sorting ``bcp_47``'s
        OpenType equivalents.
    """
    if bcp_47 == 'ak' and ot == 'AKA':
        return -1
    if bcp_47 == 'tw' and ot == 'TWI':
        return -1
    return 0

# Manually chosen BCP 47 tags for ambiguous OpenType tags. Checked and
# normalized by ``verify_disambiguation_dict``.
disambiguation = {
    'ALT': 'alt',
    'ARK': 'rki',
    'BHI': 'bhb',
    'BLN': 'bjt',
    'BTI': 'beb',
    'CCHN': 'cco',
    'CMR': 'swb',
    'CPP': 'crp',
    'CRR': 'crx',
    'DUJ': 'dwu',
    'ECR': 'crj',
    'HAL': 'cfm',
    'HND': 'hnd',
    'KIS': 'kqs',
    'KUI': 'uki',
    'LRC': 'bqi',
    'NDB': 'nd',
    'NIS': 'njz',
    'PLG': 'pce',
    'PRO': 'pro',
    'QIN': 'bgr',
    'QUH': 'quh',
    'QVI': 'qvi',
    'QWH': 'qwh',
    'SIG': 'stv',
    'TNE': 'yrk',
    'ZHH': 'zh-HK',
    'ZHS': 'zh-Hans',
    'ZHT': 'zh-Hant',
}

ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
ot.sort_languages ()

print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' * %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')

def hb_tag (tag):
    """Convert a tag to ``HB_TAG`` form.

    Args:
        tag (str): An OpenType tag.

    Returns:
        A snippet of C++ representing ``tag``.
    """
    # Pad with spaces to four characters, then truncate to exactly four.
    return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])

def get_variant_set (name):
    """Return a set of variant language names from a name.

    Args:
        name (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        A set of normalized language names. Each element is an ASCII
        ``bytes`` object, since the names are encoded while being
        normalized.
    """
    return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'"))
            .encode ('ASCII', 'ignore')
            .strip ()
            for n in re.split ('[\n(),]', name) if n)

def language_name_intersection (a, b):
    """Return the names in common between two language names.

    Args:
        a (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.
        b (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        The normalized language names shared by ``a`` and ``b``.
    """
    return get_variant_set (a).intersection (get_variant_set (b))

def get_matching_language_name (intersection, candidates):
    """Return the first candidate sharing a normalized name with
    ``intersection``.

    Args:
        intersection: A set of normalized names as produced by
            ``language_name_intersection``.
        candidates (Iterable[str]): Language names to test in order.

    Raises:
        StopIteration: No candidate matches.
    """
    return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c))))

def same_tag (bcp_47_tag, ot_tags):
    """Return whether a three-letter BCP 47 tag maps to exactly one
    OpenType tag which is the same tag in upper case.
    """
    return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()

for language, tags in sorted (ot.from_bcp_47.items ()):
    # Empty and multi-subtag tags cannot be table entries.
    if language == '' or '-' in language:
        continue
    commented_out = same_tag (language, tags)
    for tag in tags:
        print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
        if commented_out:
            print ('*/', end='')
        print ('\t/* ', end='')
        bcp_47_name = bcp_47.names.get (language, '')
        bcp_47_name_candidates = bcp_47_name.split ('\n')
        intersection = language_name_intersection (bcp_47_name, ot.names[tag])
        scope = bcp_47.scopes.get (language, '')
        if not intersection:
            write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
        else:
            name = get_matching_language_name (intersection, bcp_47_name_candidates)
            bcp_47.names[language] = name
            write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
        print (' */')

print ('};')
print ()

print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static bool')
print ('hb_ot_tags_from_complex_language (const char *lang_str,')
print ('\t\t\t\t const char *limit,')
print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
print ('{')

def print_subtag_matches (subtag, new_line):
    """Print a ``subtag_matches`` call for ``subtag``, if it is truthy.

    Args:
        subtag (str): A subtag to match, or a falsy value to print
            nothing.
        new_line (bool): Whether to first print a newline followed by a
            tab and ``&& `` so the call continues a condition.
    """
    if subtag:
        if new_line:
            print ()
            print ('\t&& ', end='')
        print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')

# Group the complex tags, longest tag first, by ``get_group``.
complex_tags = collections.defaultdict (list)
for initial, group in itertools.groupby ((lt_tags for lt_tags in [
            (LanguageTag (language), tags)
            for language, tags in sorted (ot.from_bcp_47.items (),
                key=lambda i: (-len (i[0]), i[0]))
        ] if lt_tags[0].is_complex ()),
        key=lambda lt_tags: lt_tags[0].get_group ()):
    complex_tags[initial] += group

# Tags in the 'und' group are matched before the switch statement.
for initial, items in sorted (complex_tags.items ()):
    if initial != 'und':
        continue
    for lt, tags in items:
        if lt.variant in bcp_47.prefixes:
            expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
                    '%s is not a valid prefix of %s' % (lt.language, lt.variant))
        print (' if (', end='')
        print_subtag_matches (lt.script, False)
        print_subtag_matches (lt.region, False)
        print_subtag_matches (lt.variant, False)
        print (')')
        print (' {')
        write (' /* %s */' % bcp_47.get_name (lt))
        print ()
        if len (tags) == 1:
            write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
            print ()
            print (' *count = 1;')
        else:
            # Declare the loop counter used by the emitted for loop, as
            # the switch-based branch below does.
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for tag in tags:
                write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
            print (' tags[i] = possible_tags[i];')
            print (' *count = i;')
        print (' return true;')
        print (' }')

print (' switch (lang_str[0])')
print (' {')
for initial, items in sorted (complex_tags.items ()):
    if initial == 'und':
        continue
    print (" case '%s':" % initial)
    for lt, tags in items:
        print (' if (', end='')
        if lt.grandfathered:
            print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
        else:
            # Fold a script (and a region directly following it) into
            # the prefix literal; subtags folded in are cleared so they
            # are not matched again below.
            string_literal = lt.language[1:] + '-'
            if lt.script:
                string_literal += lt.script
                lt.script = None
                if lt.region:
                    string_literal += '-' + lt.region
                    lt.region = None
            if string_literal[-1] == '-':
                print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
            else:
                print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
        print_subtag_matches (lt.script, True)
        print_subtag_matches (lt.region, True)
        print_subtag_matches (lt.variant, True)
        print (')')
        print (' {')
        write (' /* %s */' % bcp_47.get_name (lt))
        print ()
        if len (tags) == 1:
            write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
            print ()
            print (' *count = 1;')
        else:
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for tag in tags:
                write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
            print ('\ttags[i] = possible_tags[i];')
            print (' *count = i;')
        print (' return true;')
        print (' }')
    print (' break;')

print (' }')
print (' return false;')
print ('}')
print ()
print ('/**')
print (' * hb_ot_ambiguous_tag_to_language')
print (' * @tag: A language tag.')
print (' *')
print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
print (' * many language tags) and the best tag is not the alphabetically first, or if')
print (' * the best tag consists of multiple subtags, or if the best tag does not appear')
print (' * in #ot_languages.')
print (' *')
print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
print (' **/')
print ('static hb_language_t')
print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
print ('{')
print (' switch (tag)')
print (' {')

def verify_disambiguation_dict ():
    """Verify and normalize ``disambiguation``.

    ``disambiguation`` is a map of ambiguous OpenType language system
    tags to the particular BCP 47 tags they correspond to. This function
    checks that all its keys really are ambiguous and that each key's
    value is valid for that key. It checks that no ambiguous tag is
    missing, except when it can figure out which BCP 47 tag is the best
    by itself.

    It modifies ``disambiguation`` to remove keys whose values are the
    same as those that the fallback would return anyway, and to add
    ambiguous keys whose disambiguations it determined automatically.

    Raises:
        AssertionError: Verification failed.
    """
    global bcp_47
    global disambiguation
    global ot
    for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
        # BCP 47 tags whose first-choice OpenType tag is this one.
        primary_tags = [t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag]
        if len (primary_tags) == 1:
            expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
            if '-' in primary_tags[0]:
                disambiguation[ot_tag] = primary_tags[0]
            else:
                first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0]
                if primary_tags[0] != first_tag:
                    disambiguation[ot_tag] = primary_tags[0]
        elif len (primary_tags) == 0:
            expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
        else:
            # Prefer a unique macrolanguage, then a unique family, then
            # a unique non-retired code.
            macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
            if len (macrolanguages) != 1:
                macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]']
            if len (macrolanguages) != 1:
                macrolanguages = [t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')]
            if len (macrolanguages) != 1:
                expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
                expect (disambiguation[ot_tag] in bcp_47_tags,
                        '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
            elif ot_tag not in disambiguation:
                disambiguation[ot_tag] = macrolanguages[0]
            different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
            if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
                del disambiguation[ot_tag]
    for ot_tag in disambiguation.keys ():
        expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)

verify_disambiguation_dict ()
for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
    write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
    print ()
    write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
    print ()

print (' default:')
print (' return HB_LANGUAGE_INVALID;')
print (' }')
print ('}')

print ()
print ('#endif /* HB_OT_TAG_TABLE_HH */')
print ()
print ('/* == End of generated table == */')