1#!/usr/bin/env python3
2
3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
4versa.
5
6It creates a ``const LangTag[]``, matching the tags from the OpenType
7languages system tag list to the language subtags of the BCP 47 language
8subtag registry, with some manual adjustments. The mappings are
9supplemented with macrolanguages' sublanguages and retired codes'
10replacements, according to BCP 47 and some manual additions where BCP 47
11omits a retired code entirely.
12
13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16multiple BCP 47 tags) are listed here, except when the alphabetically
17first BCP 47 tag happens to be the chosen disambiguated tag. In that
18case, the fallback behavior will choose the right tag anyway.
19
20usage: ./gen-tag-table.py languagetags language-subtag-registry
21
22Input files:
23* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
25"""
26
27import collections
28from html.parser import HTMLParser
def write (s):
	"""Write ``s`` to standard output as UTF-8 bytes.

	The text layer of ``sys.stdout`` is flushed before the encoded bytes
	are written to its underlying binary buffer, which keeps anything
	already queued through ``print`` ahead of these bytes.

	Args:
		s (str): The text to write.
	"""
	stream = sys.stdout
	stream.flush ()
	encoded = s.encode ('utf-8')
	stream.buffer.write (encoded)
32import itertools
33import re
34import sys
35import unicodedata
36
# The script needs exactly two arguments (the two input files named in the
# usage line). Otherwise exit, using the module docstring as the exit message.
if len (sys.argv) != 3:
	sys.exit (__doc__)
39
40from html import unescape
def html_unescape (parser, entity):
	"""Decode an HTML character or entity reference.

	Args:
		parser (HTMLParser): Unused; accepted so that callers can pass
			the parser that encountered the reference.
		entity (str): A complete reference such as ``'&amp;'`` or
			``'&#233;'``.

	Returns:
		The text that ``entity`` stands for.
	"""
	decoded = unescape (entity)
	return decoded
43
def expect (condition, message=None):
	"""Raise ``AssertionError`` unless ``condition`` is truthy.

	Args:
		condition: The value that is expected to be truthy.
		message: The argument for the ``AssertionError``. If ``None``,
			the error is raised without arguments.

	Raises:
		AssertionError: ``condition`` is falsy.
	"""
	if condition:
		return
	raise AssertionError if message is None else AssertionError (message)
49
# Map of ISO 639-3 (three-letter) codes to their ISO 639-1 (two-letter)
# equivalents, from https://www-01.sil.org/iso639-3/iso-639-3.tab
51ISO_639_3_TO_1 = {
52	'aar': 'aa',
53	'abk': 'ab',
54	'afr': 'af',
55	'aka': 'ak',
56	'amh': 'am',
57	'ara': 'ar',
58	'arg': 'an',
59	'asm': 'as',
60	'ava': 'av',
61	'ave': 'ae',
62	'aym': 'ay',
63	'aze': 'az',
64	'bak': 'ba',
65	'bam': 'bm',
66	'bel': 'be',
67	'ben': 'bn',
68	'bis': 'bi',
69	'bod': 'bo',
70	'bos': 'bs',
71	'bre': 'br',
72	'bul': 'bg',
73	'cat': 'ca',
74	'ces': 'cs',
75	'cha': 'ch',
76	'che': 'ce',
77	'chu': 'cu',
78	'chv': 'cv',
79	'cor': 'kw',
80	'cos': 'co',
81	'cre': 'cr',
82	'cym': 'cy',
83	'dan': 'da',
84	'deu': 'de',
85	'div': 'dv',
86	'dzo': 'dz',
87	'ell': 'el',
88	'eng': 'en',
89	'epo': 'eo',
90	'est': 'et',
91	'eus': 'eu',
92	'ewe': 'ee',
93	'fao': 'fo',
94	'fas': 'fa',
95	'fij': 'fj',
96	'fin': 'fi',
97	'fra': 'fr',
98	'fry': 'fy',
99	'ful': 'ff',
100	'gla': 'gd',
101	'gle': 'ga',
102	'glg': 'gl',
103	'glv': 'gv',
104	'grn': 'gn',
105	'guj': 'gu',
106	'hat': 'ht',
107	'hau': 'ha',
108	'hbs': 'sh',
109	'heb': 'he',
110	'her': 'hz',
111	'hin': 'hi',
112	'hmo': 'ho',
113	'hrv': 'hr',
114	'hun': 'hu',
115	'hye': 'hy',
116	'ibo': 'ig',
117	'ido': 'io',
118	'iii': 'ii',
119	'iku': 'iu',
120	'ile': 'ie',
121	'ina': 'ia',
122	'ind': 'id',
123	'ipk': 'ik',
124	'isl': 'is',
125	'ita': 'it',
126	'jav': 'jv',
127	'jpn': 'ja',
128	'kal': 'kl',
129	'kan': 'kn',
130	'kas': 'ks',
131	'kat': 'ka',
132	'kau': 'kr',
133	'kaz': 'kk',
134	'khm': 'km',
135	'kik': 'ki',
136	'kin': 'rw',
137	'kir': 'ky',
138	'kom': 'kv',
139	'kon': 'kg',
140	'kor': 'ko',
141	'kua': 'kj',
142	'kur': 'ku',
143	'lao': 'lo',
144	'lat': 'la',
145	'lav': 'lv',
146	'lim': 'li',
147	'lin': 'ln',
148	'lit': 'lt',
149	'ltz': 'lb',
150	'lub': 'lu',
151	'lug': 'lg',
152	'mah': 'mh',
153	'mal': 'ml',
154	'mar': 'mr',
155	'mkd': 'mk',
156	'mlg': 'mg',
157	'mlt': 'mt',
158	'mol': 'mo',
159	'mon': 'mn',
160	'mri': 'mi',
161	'msa': 'ms',
162	'mya': 'my',
163	'nau': 'na',
164	'nav': 'nv',
165	'nbl': 'nr',
166	'nde': 'nd',
167	'ndo': 'ng',
168	'nep': 'ne',
169	'nld': 'nl',
170	'nno': 'nn',
171	'nob': 'nb',
172	'nor': 'no',
173	'nya': 'ny',
174	'oci': 'oc',
175	'oji': 'oj',
176	'ori': 'or',
177	'orm': 'om',
178	'oss': 'os',
179	'pan': 'pa',
180	'pli': 'pi',
181	'pol': 'pl',
182	'por': 'pt',
183	'pus': 'ps',
184	'que': 'qu',
185	'roh': 'rm',
186	'ron': 'ro',
187	'run': 'rn',
188	'rus': 'ru',
189	'sag': 'sg',
190	'san': 'sa',
191	'sin': 'si',
192	'slk': 'sk',
193	'slv': 'sl',
194	'sme': 'se',
195	'smo': 'sm',
196	'sna': 'sn',
197	'snd': 'sd',
198	'som': 'so',
199	'sot': 'st',
200	'spa': 'es',
201	'sqi': 'sq',
202	'srd': 'sc',
203	'srp': 'sr',
204	'ssw': 'ss',
205	'sun': 'su',
206	'swa': 'sw',
207	'swe': 'sv',
208	'tah': 'ty',
209	'tam': 'ta',
210	'tat': 'tt',
211	'tel': 'te',
212	'tgk': 'tg',
213	'tgl': 'tl',
214	'tha': 'th',
215	'tir': 'ti',
216	'ton': 'to',
217	'tsn': 'tn',
218	'tso': 'ts',
219	'tuk': 'tk',
220	'tur': 'tr',
221	'twi': 'tw',
222	'uig': 'ug',
223	'ukr': 'uk',
224	'urd': 'ur',
225	'uzb': 'uz',
226	'ven': 've',
227	'vie': 'vi',
228	'vol': 'vo',
229	'wln': 'wa',
230	'wol': 'wo',
231	'xho': 'xh',
232	'yid': 'yi',
233	'yor': 'yo',
234	'zha': 'za',
235	'zho': 'zh',
236	'zul': 'zu',
237}
238
class LanguageTag (object):
	"""A BCP 47 language tag, split into the subtags this script uses.

	Attributes:
		subtags (List[str]): The lowercased subtags of the tag, in order.
		grandfathered (bool): Whether the lowercased tag is in
			``bcp_47.grandfathered``. If ``True``, the entire lowercased
			tag is the ``language`` and the other subtag fields are
			empty strings.
		language (str): The language subtag.
		script (Optional[str]): The script subtag, or ``None`` if no
			subtag looks like a script.
		region (Optional[str]): The region subtag, or ``None`` if no
			subtag after the first looks like a region.
		variant (Optional[str]): The variant subtag, or ``None`` if no
			subtag looks like a variant.

	Args:
		tag (str): A BCP 47 language tag.

	"""
	def __init__ (self, tag):
		lowered = tag.lower ()
		self.subtags = lowered.split ('-')
		self.grandfathered = lowered in bcp_47.grandfathered
		if not self.grandfathered:
			# Subtags are classified by length and by whether they start
			# with a digit (any character that sorts at or below '9').
			looks_like_script = lambda s: len (s) == 4 and s[0] > '9'
			looks_like_region = lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9'
			looks_like_variant = lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9'
			self.language = self.subtags[0]
			self.script = self._find_first (looks_like_script, self.subtags)
			self.region = self._find_first (looks_like_region, self.subtags[1:])
			self.variant = self._find_first (looks_like_variant, self.subtags)
		else:
			# A grandfathered tag is treated as one opaque language.
			self.language = lowered
			self.script = ''
			self.region = ''
			self.variant = ''

	def __str__(self):
		separator = '-'
		return separator.join (self.subtags)

	def __repr__ (self):
		return 'LanguageTag(%r)' % (str (self),)

	@staticmethod
	def _find_first (function, sequence):
		"""Return the first item of ``sequence`` accepted by ``function``.

		Returns:
			The first matching item, or ``None`` if there is none.
		"""
		return next (filter (function, sequence), None)

	def is_complex (self):
		"""Return whether this tag is too complex to represent as a
		``LangTag`` in the generated code.

		A tag with a single subtag is never complex. A grandfathered
		tag is not complex if its second subtag is not three characters
		long and ``ot.from_bcp_47`` holds equal values for its first
		subtag and for the whole tag. Every other tag is complex and
		needs to be handled in ``hb_ot_tags_from_complex_language``.

		Returns:
			Whether this tag is complex.
		"""
		if len (self.subtags) == 1:
			return False
		simple_grandfathered = (self.grandfathered
			and len (self.subtags[1]) != 3
			and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
		return not simple_grandfathered

	def get_group (self):
		"""Return the group into which this tag should be categorized in
		``hb_ot_tags_from_complex_language``.

		The group is ``'und'`` if the language is ``und`` or if the
		variant has exactly one registered prefix; such tags are not
		matched in a ``switch`` statement in the generated code.
		Otherwise the group is the first letter of the language.

		Returns:
			This tag's group.
		"""
		if self.language == 'und':
			return 'und'
		if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
			return 'und'
		return self.language[0]
314
class OpenTypeRegistryParser (HTMLParser):
	"""A parser for the OpenType language system tag registry.

	Attributes:
		header (str): The "last updated" line of the registry, stored as
			the raw text of the ``<meta name="updated_at">`` start tag.
		names (Mapping[str, str]): A map of language system tags to the
			names they are given in the registry.
		ranks (DefaultDict[str, int]): A map of language system tags to
			numbers. If a single BCP 47 tag corresponds to multiple
			OpenType tags, the tags are ordered in increasing order by
			rank. The rank is based on the number of BCP 47 tags
			associated with a tag, though it may be manually modified.
		to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
			OpenType language system tags to sets of BCP 47 tags.
		from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
			inverted. Its values start as unsorted sets;
			``sort_languages`` converts them to sorted lists.

	"""
	def __init__ (self):
		HTMLParser.__init__ (self)
		self.header = ''
		self.names = {}
		self.ranks = collections.defaultdict (int)
		self.to_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47 = collections.defaultdict (set)
		# Whether the parser is in a <td> element
		self._td = False
		# The text of the <td> elements of the current <tr> element.
		self._current_tr = []

	def handle_starttag (self, tag, attrs):
		"""Record the ``updated_at`` <meta> tag and track table rows/cells."""
		if tag == 'meta':
			for attr, value in attrs:
				if attr == 'name' and value == 'updated_at':
					self.header = self.get_starttag_text ()
					break
		elif tag == 'td':
			self._td = True
			self._current_tr.append ('')
		elif tag == 'tr':
			self._current_tr = []

	def handle_endtag (self, tag):
		"""Finish a cell, or record the registry entry of a finished row.

		A row is expected to hold a language name, an OpenType tag
		(optionally marked ``(deprecated)``) and, optionally, a
		comma-separated list of ISO 639 codes.

		Raises:
			AssertionError: A row has an unexpected number of cells or
				an ill-formed tag.
		"""
		if tag == 'td':
			self._td = False
		elif tag == 'tr' and self._current_tr:
			expect (2 <= len (self._current_tr) <= 3)
			name = self._current_tr[0].strip ()
			ot_tag = self._current_tr[1].strip ("\t\n\v\f\r '")
			rank = 0
			if len (ot_tag) > 4:
				expect (ot_tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % ot_tag)
				name += ' (deprecated)'
				ot_tag = ot_tag.split (' ')[0]
				# Deprecated tags sort after non-deprecated ones with
				# the same number of BCP 47 tags.
				rank = 1
			self.names[ot_tag] = re.sub (' languages$', '', name)
			# The row check above allows two-cell rows, which have no
			# ISO code cell at all; treat them like an empty cell
			# instead of failing with an IndexError.
			if len (self._current_tr) < 3 or not self._current_tr[2]:
				return
			iso_codes = self._current_tr[2].strip ()
			self.to_bcp_47[ot_tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
			rank += 2 * len (self.to_bcp_47[ot_tag])
			self.ranks[ot_tag] = rank

	def handle_data (self, data):
		"""Append text to the current cell, if the parser is in one."""
		if self._td:
			self._current_tr[-1] += data

	def handle_charref (self, name):
		"""Treat a numeric character reference as the text it denotes."""
		self.handle_data (html_unescape (self, '&#%s;' % name))

	def handle_entityref (self, name):
		"""Treat a named entity reference as the text it denotes."""
		self.handle_data (html_unescape (self, '&%s;' % name))

	def parse (self, filename):
		"""Parse the OpenType language system tag registry.

		After parsing, ``from_bcp_47`` is filled in by inverting
		``to_bcp_47``.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: No "last updated" header was found.
		"""
		with open (filename, encoding='utf-8') as f:
			self.feed (f.read ())
		expect (self.header)
		for tag, iso_codes in self.to_bcp_47.items ():
			for iso_code in iso_codes:
				self.from_bcp_47[iso_code].add (tag)

	def add_language (self, bcp_47_tag, ot_tag):
		"""Add a language as if it were in the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
				a language subtag, and if the language subtag is a
				macrolanguage, then new languages are added corresponding
				to the macrolanguages' individual languages with the
				remainder of the tag appended.
			ot_tag (str): An OpenType language system tag.
		"""
		self.to_bcp_47[ot_tag].add (bcp_47_tag)
		self.from_bcp_47[bcp_47_tag].add (ot_tag)
		if bcp_47_tag.lower () not in bcp_47.grandfathered:
			try:
				# Unpacking raises ValueError for a tag without '-',
				# i.e. a bare language subtag, which needs nothing more.
				[macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
				if macrolanguage in bcp_47.macrolanguages:
					s = set ()
					for language in bcp_47.macrolanguages[macrolanguage]:
						if language.lower () not in bcp_47.grandfathered:
							s.add ('%s-%s' % (language, suffix))
					bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
			except ValueError:
				pass

	@staticmethod
	def _remove_language (tag_1, dict_1, dict_2):
		"""Remove ``tag_1`` from ``dict_1`` and from every value of
		``dict_2`` that it mapped to, deleting values left empty.
		"""
		for tag_2 in dict_1.pop (tag_1):
			dict_2[tag_2].remove (tag_1)
			if not dict_2[tag_2]:
				del dict_2[tag_2]

	def remove_language_ot (self, ot_tag):
		"""Remove an OpenType tag from the registry.

		Args:
			ot_tag (str): An OpenType tag.
		"""
		self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

	def remove_language_bcp_47 (self, bcp_47_tag):
		"""Remove a BCP 47 tag from the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag.
		"""
		self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

	def inherit_from_macrolanguages (self):
		"""Copy mappings from macrolanguages to individual languages.

		If a BCP 47 tag for an individual mapping has no OpenType
		mapping but its macrolanguage does, the mapping is copied to
		the individual language. For example, als (Tosk Albanian) has no
		explicit mapping, so it inherits from sq (Albanian) the mapping
		to SQI.

		If a BCP 47 tag for a macrolanguage has no OpenType mapping but
		all of its individual languages do and they all map to the same
		tags, the mapping is copied to the macrolanguage.
		"""
		# Decisions are based on the keys present before this call, so
		# that mappings added below do not influence later iterations.
		original_ot_from_bcp_47 = dict (self.from_bcp_47)
		for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
			ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
			if ot_macrolanguages:
				for ot_macrolanguage in ot_macrolanguages:
					for language in languages:
						# Remove the following condition if e.g. nn should map to NYN,NOR
						# instead of just NYN.
						if language not in original_ot_from_bcp_47:
							self.add_language (language, ot_macrolanguage)
							self.ranks[ot_macrolanguage] += 1
			else:
				# Intersect the mappings of the individual languages; give
				# up as soon as one is unmapped or nothing is in common.
				for language in languages:
					if language in original_ot_from_bcp_47:
						if ot_macrolanguages:
							ml = original_ot_from_bcp_47[language]
							if ml:
								ot_macrolanguages &= ml
						else:
							ot_macrolanguages |= original_ot_from_bcp_47[language]
					else:
						ot_macrolanguages.clear ()
					if not ot_macrolanguages:
						break
				for ot_macrolanguage in ot_macrolanguages:
					self.add_language (macrolanguage, ot_macrolanguage)

	def sort_languages (self):
		"""Sort the values of ``from_bcp_47`` in ascending rank order.

		Ties are broken by the OpenType tag itself. Each value is
		replaced by a sorted list.
		"""
		for language, tags in self.from_bcp_47.items ():
			self.from_bcp_47[language] = sorted (tags,
					key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
499
# The shared OpenType registry instance. ``LanguageTag`` and ``BCP47Parser``
# read it through the module-level name ``ot``.
ot = OpenTypeRegistryParser ()
501
class BCP47Parser (object):
	"""A parser for the BCP 47 subtag registry.

	Attributes:
		header (str): The "File-Date" line of the registry.
		names (Mapping[str, str]): A map of subtags to the names they
			are given in the registry. Each value is a
			``'\\n'``-separated list of names.
		scopes (Mapping[str, str]): A map of language subtags to strings
			suffixed to language names, including suffixes to explain
			language scopes.
		macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
			language subtags to the sets of language subtags which
			inherit from them. See
			``OpenTypeRegistryParser.inherit_from_macrolanguages``.
		prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
			subtags to their prefixes.
		grandfathered (AbstractSet[str]): The set of grandfathered tags,
			normalized to lowercase.

	"""
	def __init__ (self):
		self.header = ''
		self.names = {}
		self.scopes = {}
		self.macrolanguages = collections.defaultdict (set)
		self.prefixes = collections.defaultdict (set)
		self.grandfathered = set ()

	def parse (self, filename):
		"""Parse the BCP 47 subtag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: No "File-Date" header was found.
		"""
		with open (filename, encoding='utf-8') as f:
			subtag_type = None
			subtag = None
			deprecated = False
			has_preferred_value = False
			line_buffer = ''
			# Each field is handled one step late, once the next line has
			# shown that it is not a continuation. The extra empty line
			# flushes the final field.
			for line in itertools.chain (f, ['']):
				line = line.rstrip ()
				if line.startswith (' '):
					# Continuation line: join it to the buffered field.
					line_buffer += line[1:]
					continue
				line, line_buffer = line_buffer, line
				if line.startswith ('Type: '):
					subtag_type = line.split (' ')[1]
					deprecated = False
					has_preferred_value = False
				elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
					subtag = line.split (' ')[1]
					if subtag_type == 'grandfathered':
						self.grandfathered.add (subtag.lower ())
				elif line.startswith ('Description: '):
					description = line.split (' ', 1)[1].replace (' (individual language)', '')
					# A raw string keeps the escaped parentheses valid
					# regex syntax without invalid string escapes.
					description = re.sub (r' (\((individual |macro)language\)|languages)$', '',
							description)
					if subtag in self.names:
						self.names[subtag] += '\n' + description
					else:
						self.names[subtag] = description
				elif subtag_type == 'language' or subtag_type == 'grandfathered':
					if line.startswith ('Scope: '):
						scope = line.split (' ')[1]
						if scope == 'macrolanguage':
							scope = ' [macrolanguage]'
						elif scope == 'collection':
							scope = ' [family]'
						else:
							continue
						self.scopes[subtag] = scope
					elif line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
						deprecated = True
					elif deprecated and line.startswith ('Comments: see '):
						# If a subtag is split into multiple replacement subtags,
						# it essentially represents a macrolanguage.
						for language in line.replace (',', '').split (' ')[2:]:
							self._add_macrolanguage (subtag, language)
					elif line.startswith ('Preferred-Value: '):
						# If a subtag is deprecated in favor of a single replacement subtag,
						# it is either a dialect or synonym of the preferred subtag. Either
						# way, it is close enough to the truth to consider the replacement
						# the macrolanguage of the deprecated language.
						has_preferred_value = True
						macrolanguage = line.split (' ')[1]
						self._add_macrolanguage (macrolanguage, subtag)
					elif not has_preferred_value and line.startswith ('Macrolanguage: '):
						self._add_macrolanguage (line.split (' ')[1], subtag)
				elif subtag_type == 'variant':
					if line.startswith ('Prefix: '):
						self.prefixes[subtag].add (line.split (' ')[1])
				elif line.startswith ('File-Date: '):
					self.header = line
		expect (self.header)

	def _add_macrolanguage (self, macrolanguage, language):
		"""Record that ``language`` inherits from ``macrolanguage``.

		If ``language`` has no OpenType mapping, the languages already
		recorded under it are first added to ``macrolanguage`` as well.
		If ``macrolanguage`` has no OpenType mapping but is itself
		recorded under another macrolanguage, ``language`` is added to
		that one instead.
		"""
		if language not in ot.from_bcp_47:
			for sublanguage in self.macrolanguages.get (language, set ()):
				self._add_macrolanguage (macrolanguage, sublanguage)
		if macrolanguage not in ot.from_bcp_47:
			for ls in list (self.macrolanguages.values ()):
				if macrolanguage in ls:
					ls.add (language)
					return
		self.macrolanguages[macrolanguage].add (language)

	def remove_extra_macrolanguages (self):
		"""Make every language have at most one macrolanguage.

		For a language recorded under several macrolanguages, the one
		with the most languages is kept as the main one and the others
		are recorded as inheriting from it.
		"""
		inverted = collections.defaultdict (list)
		for macrolanguage, languages in self.macrolanguages.items ():
			for language in languages:
				inverted[language].append (macrolanguage)
		for language, macrolanguages in inverted.items ():
			if len (macrolanguages) > 1:
				macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
				biggest_macrolanguage = macrolanguages.pop ()
				for macrolanguage in macrolanguages:
					self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

	def get_name (self, lt):
		"""Return the names of the subtags in a language tag.

		Only the first registered name of each subtag is used, and the
		names are joined with ``'; '``.

		Args:
			lt (LanguageTag): A BCP 47 language tag.

		Returns:
			The name form of ``lt``.
		"""
		name = self.names[lt.language].split ('\n')[0]
		if lt.script:
			name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
		if lt.region:
			name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
		if lt.variant:
			name += '; ' + self.names[lt.variant].split ('\n')[0]
		return name
642
# The shared BCP 47 registry instance. ``LanguageTag`` and
# ``OpenTypeRegistryParser`` read it through the module-level name ``bcp_47``.
bcp_47 = BCP47Parser ()

# Parse the two registries named on the command line.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Everything below, up to ``rank_delta``, manually adjusts the parsed data:
# extra mappings, replaced mappings, rank tweaks, and names, scopes and
# macrolanguage relations for codes set by hand.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace the registry's own mapping for PGR with the tag el-polyton.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Explicit mappings from qu and individual q?? codes to the OpenType tags
# QUZ, QUH, QVI and QWH.
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Move mo from under ro to under ro-MD.
bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

# Map the three Syriac tags from script-only tags (und-<script>).
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Drop the registry's own mappings for ZHH, ZHP and ZHT, then map ZHH, ZHS
# and ZHT from script- and region-qualified zh, lzh and yue tags.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
764
def rank_delta (bcp_47, ot):
	"""Return the rank adjustment for one BCP 47/OpenType tag pair.

	An OpenType tag normally has the same rank whichever BCP 47 tag it
	is sorted under. The pairs special-cased here are the exceptions:
	for them the OpenType tag's rank is lowered by one.

	Args:
		bcp_47 (str): A BCP 47 tag.
		ot (str): An OpenType tag.

	Returns:
		The number to add to ``ot``'s rank when sorting ``bcp_47``'s
		OpenType equivalents: ``-1`` for a special-cased pair and ``0``
		otherwise.
	"""
	promoted_pairs = (('ak', 'AKA'), ('tw', 'TWI'))
	if (bcp_47, ot) in promoted_pairs:
		return -1
	return 0
784
# Map of ambiguous OpenType language system tags (those that correspond to
# multiple BCP 47 tags) to the BCP 47 tag chosen for each. It is checked and
# normalized by ``verify_disambiguation_dict``.
disambiguation = {
	'ALT': 'alt',
	'ARK': 'rki',
	'BHI': 'bhb',
	'BLN': 'bjt',
	'BTI': 'beb',
	'CCHN': 'cco',
	'CMR': 'swb',
	'CPP': 'crp',
	'CRR': 'crx',
	'DUJ': 'dwu',
	'ECR': 'crj',
	'HAL': 'cfm',
	'HND': 'hnd',
	'KIS': 'kqs',
	'KUI': 'uki',
	'LRC': 'bqi',
	'NDB': 'nd',
	'NIS': 'njz',
	'PLG': 'pce',
	'PRO': 'pro',
	'QIN': 'bgr',
	'QUH': 'quh',
	'QVI': 'qvi',
	'QWH': 'qwh',
	'SIG': 'stv',
	'TNE': 'yrk',
	'ZHH': 'zh-HK',
	'ZHS': 'zh-Hans',
	'ZHT': 'zh-Hant',
}
816
# Copy mappings between macrolanguages and their individual languages, give
# languages that have several macrolanguages a single one, copy mappings
# again using the adjusted relations, and finally order each BCP 47 tag's
# OpenType tags by rank.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
ot.sort_languages ()

# Emit the banner comment of the generated file, which records the command
# line and the headers of both input registries, then open the include
# guard and the ``ot_languages`` array.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')
838
def hb_tag (tag):
	"""Convert a tag to ``HB_TAG`` form.

	The tag is padded with spaces on the right, or truncated, to exactly
	four characters, each of which becomes one quoted argument.

	Args:
		tag (str): An OpenType tag.

	Returns:
		A snippet of C++ representing ``tag``.
	"""
	padded = tag.ljust (4)[:4]
	quoted = ','.join ("'%s'" % character for character in padded)
	return 'HB_TAG(%s)' % quoted
849
def get_variant_set (name):
	"""Return a set of variant language names from a name.

	``name`` is split on newlines, parentheses and commas. Each
	non-empty piece has right single quotation marks replaced with
	apostrophes, is decomposed (NFD), loses every non-ASCII character,
	and is stripped of surrounding whitespace.

	Args:
		name (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		A set of normalized language names, as ASCII ``bytes``.
	"""
	variants = set ()
	for piece in re.split ('[\n(),]', name):
		if not piece:
			continue
		decomposed = unicodedata.normalize ('NFD', piece.replace ('\u2019', "'"))
		ascii_only = decomposed.encode ('ASCII', 'ignore')
		variants.add (ascii_only.strip ())
	return variants
864
def language_name_intersection (a, b):
	"""Return the names in common between two language names.

	Both arguments are normalized with ``get_variant_set`` before they
	are compared.

	Args:
		a (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.
		b (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		The normalized language names shared by ``a`` and ``b``.
	"""
	names_a = get_variant_set (a)
	names_b = get_variant_set (b)
	return names_a & names_b
878
def get_matching_language_name (intersection, candidates):
	"""Return the first candidate that shares a name with ``intersection``.

	Args:
		intersection (AbstractSet): A set of normalized language names,
			as returned by ``language_name_intersection``.
		candidates (Iterable[str]): Language names to try, in order.

	Returns:
		The first candidate whose ``get_variant_set`` is not disjoint
		from ``intersection``.

	Raises:
		StopIteration: No candidate matches.
	"""
	for candidate in candidates:
		if not intersection.isdisjoint (get_variant_set (candidate)):
			return candidate
	raise StopIteration
881
def same_tag (bcp_47_tag, ot_tags):
	"""Return whether a BCP 47 tag maps only to its own uppercase form.

	Args:
		bcp_47_tag (str): A BCP 47 tag.
		ot_tags (Sequence[str]): The OpenType tags it maps to.

	Returns:
		``True`` if ``bcp_47_tag`` is three characters long and
		``ot_tags`` holds exactly one tag which, lowercased, equals
		``bcp_47_tag``.
	"""
	if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
		return False
	return bcp_47_tag == ot_tags[0].lower ()
884
# Emit one ``LangTag`` initializer per (BCP 47 tag, OpenType tag) pair, in
# sorted order of the BCP 47 tag. The empty tag and tags with more than one
# subtag are skipped here.
for language, tags in sorted (ot.from_bcp_47.items ()):
	if language == '' or '-' in language:
		continue
	# An entry whose three-letter BCP 47 tag is just its single OpenType tag
	# lowercased is still printed, but wrapped in a C comment.
	commented_out = same_tag (language, tags)
	for i, tag in enumerate (tags, start=1):
		print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else '  ', language, hb_tag (tag)), end='')
		if commented_out:
			print ('*/', end='')
		# The trailing C comment names the language. When the BCP 47 and
		# OpenType names have nothing in common, both are shown.
		print ('\t/* ', end='')
		bcp_47_name = bcp_47.names.get (language, '')
		bcp_47_name_candidates = bcp_47_name.split ('\n')
		intersection = language_name_intersection (bcp_47_name, ot.names[tag])
		scope = bcp_47.scopes.get (language, '')
		if not intersection:
			write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
		else:
			name = get_matching_language_name (intersection, bcp_47_name_candidates)
			# Keep only the matching name for this language from now on.
			bcp_47.names[language] = name
			# Show whichever of the two names is longer.
			write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
		print (' */')

print ('};')
print ()
908
# Emit the documentation comment, signature and opening brace of the
# generated C function ``hb_ot_tags_from_complex_language``.
print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static bool')
print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
print ('\t\t\t\t  const char   *limit,')
print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
print ('{')
929
def print_subtag_matches (subtag, new_line):
	"""Print a C ``subtag_matches`` call for ``subtag``, if there is one.

	Nothing is printed for a falsy ``subtag``. No trailing newline is
	printed.

	Args:
		subtag (Optional[str]): The subtag to match, without its
			leading hyphen.
		new_line (bool): Whether to first start a new line and print
			``&&``, so that the call continues an existing condition.
	"""
	if not subtag:
		return
	prefix = '\n\t&& ' if new_line else ''
	print ('%ssubtag_matches (lang_str, limit, "-%s")' % (prefix, subtag), end='')
936
# Collect the complex tags by group (see ``LanguageTag.get_group``). Tags are
# visited longest first, then alphabetically, and keep that order within each
# group. ``groupby`` only merges adjacent items, so the runs it yields for the
# same group are concatenated here.
complex_tags = collections.defaultdict (list)
for initial, group in itertools.groupby ((lt_tags for lt_tags in [
			(LanguageTag (language), tags)
			for language, tags in sorted (ot.from_bcp_47.items (),
				key=lambda i: (-len (i[0]), i[0]))
		] if lt_tags[0].is_complex ()),
		key=lambda lt_tags: lt_tags[0].get_group ()):
	complex_tags[initial] += group
945
# Emit the checks for the 'und' group. These tags are matched by their
# script, region and variant subtags alone, outside the ``switch`` statement.
for initial, items in sorted (complex_tags.items ()):
	if initial != 'und':
		continue
	for lt, tags in items:
		if lt.variant in bcp_47.prefixes:
			# The language is not part of the emitted condition, so it must
			# be the variant's registered prefix.
			expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
					'%s is not a valid prefix of %s' % (lt.language, lt.variant))
		print ('  if (', end='')
		print_subtag_matches (lt.script, False)
		print_subtag_matches (lt.region, False)
		print_subtag_matches (lt.variant, False)
		print (')')
		print ('  {')
		write ('    /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) == 1:
			write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('    *count = 1;')
		else:
			# Declare the loop index in the emitted block, as the
			# ``switch`` cases do; without it the generated C would use
			# an undeclared ``i``.
			print ('    unsigned int i;')
			print ('    hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('    };')
			print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('      tags[i] = possible_tags[i];')
			print ('    *count = i;')
		print ('    return true;')
		print ('  }')
976
# Emit the part of the generated C function that dispatches on the first
# character of ``lang_str``: one ``case`` per key of ``complex_tags``
# (except 'und', which is not emitted in this switch), each holding a chain
# of ``if`` blocks that fill ``tags``/``count`` and return true.
print ('  switch (lang_str[0])')
print ('  {')
for initial, items in sorted (complex_tags.items ()):
	if initial == 'und':
		continue
	print ("  case '%s':" % initial)
	for lt, tags in items:
		# Work on local copies of the script and region. The LanguageTag
		# objects are shared through ``complex_tags`` and ``lt`` is handed
		# to ``bcp_47.get_name`` below, so it must not be modified here.
		script = lt.script
		region = lt.region
		print ('    if (', end='')
		if lt.grandfathered:
			# Grandfathered tags are compared as whole strings.
			print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
		else:
			# The initial letter was consumed by the ``case`` label, so the
			# literal starts at the second character of the language.
			string_literal = lt.language[1:] + '-'
			if script:
				# Fold the script (and then the region) into the literal and
				# clear the local copy so it is not passed on for a second
				# match below (assumes print_subtag_matches emits nothing
				# for None -- TODO confirm).
				string_literal += script
				script = None
				if region:
					string_literal += '-' + region
					region = None
			if string_literal.endswith ('-'):
				# Nothing was folded in: match the language as a prefix,
				# including the '-' separator.
				print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
			else:
				print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
		print_subtag_matches (script, True)
		print_subtag_matches (region, True)
		print_subtag_matches (lt.variant, True)
		print (')')
		print ('    {')
		write ('      /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) == 1:
			write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('      *count = 1;')
		else:
			# Several OpenType tags: emit a local array and copy as many as
			# the caller's ``*count`` allows.
			print ('      unsigned int i;')
			print ('      hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('      };')
			print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('\ttags[i] = possible_tags[i];')
			print ('      *count = i;')
		print ('      return true;')
		print ('    }')
	print ('    break;')

# Close the switch and the C function.
print ('  }')
print ('  return false;')
print ('}')
# Emit a blank separator line, then the documentation comment, signature and
# opening ``switch`` of the generated C function
# hb_ot_ambiguous_tag_to_language. The ``case`` labels and the closing
# braces are printed further down.
print ('\n'.join ((
	'',
	'/**',
	' * hb_ot_ambiguous_tag_to_language',
	' * @tag: A language tag.',
	' *',
	' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
	' * many language tags) and the best tag is not the alphabetically first, or if',
	' * the best tag consists of multiple subtags, or if the best tag does not appear',
	' * in #ot_languages.',
	' *',
	' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
	' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
	' **/',
	'static hb_language_t',
	'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
	'{',
	'  switch (tag)',
	'  {',
)))
1045
def verify_disambiguation_dict ():
	"""Check ``disambiguation`` against the parsed data and normalize it.

	For each OpenType tag in ``ot.to_bcp_47``, its "primary" BCP 47 tags
	are those that are not grandfathered and whose first OpenType tag in
	``ot.from_bcp_47`` is that OpenType tag.

	* No primary tag: the OpenType tag must not be in ``disambiguation``.
	* One primary tag: the OpenType tag must not be in ``disambiguation``
	  on entry. An entry is added if the primary tag contains ``-`` or is
	  not the alphabetically first non-grandfathered BCP 47 tag that maps
	  to the OpenType tag.
	* Several primary tags: the candidates are narrowed by scope (a sole
	  macrolanguage, else a sole family, else a sole tag that is not a
	  retired code). If that yields exactly one tag, it is used unless
	  an entry already exists. Otherwise a manual entry is required and
	  must be one of the OpenType tag's BCP 47 tags. The entry is then
	  dropped if it has no ``-`` and equals the alphabetically first
	  primary tag for which ``same_tag`` is false.

	Finally, every key of ``disambiguation`` must be a known OpenType tag.

	Raises:
		AssertionError: Verification failed.
	"""
	# Scope tests tried in order until exactly one primary tag survives.
	scope_filters = (
		lambda t: bcp_47.scopes.get (t) == ' [macrolanguage]',
		lambda t: bcp_47.scopes.get (t) == ' [family]',
		lambda t: 'retired code' not in bcp_47.scopes.get (t, ''),
	)

	for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
		candidates = [t for t in bcp_47_tags if t not in bcp_47.grandfathered]
		primary_tags = [t for t in candidates if ot.from_bcp_47.get (t)[0] == ot_tag]

		if not primary_tags:
			expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
			continue

		if len (primary_tags) == 1:
			expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
			only_primary = primary_tags[0]
			if '-' in only_primary:
				disambiguation[ot_tag] = only_primary
			else:
				# The sole primary tag is itself among these, so the
				# sequence is never empty.
				first_tag = min (t for t in candidates if ot_tag in ot.from_bcp_47.get (t))
				if only_primary != first_tag:
					disambiguation[ot_tag] = only_primary
			continue

		# Several primary tags: try to single one out by its scope.
		for keep in scope_filters:
			preferred = [t for t in primary_tags if keep (t)]
			if len (preferred) == 1:
				break

		if len (preferred) == 1:
			# A manually specified entry takes precedence.
			disambiguation.setdefault (ot_tag, preferred[0])
		else:
			expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (preferred)))
			expect (disambiguation[ot_tag] in bcp_47_tags,
					'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))

		differing = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
		if differing:
			resolved = disambiguation[ot_tag]
			if resolved == differing[0] and '-' not in resolved:
				del disambiguation[ot_tag]

	for ot_tag in disambiguation:
		expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1095
# Validate and normalize ``disambiguation`` before emitting it; the call
# may add and remove entries.
verify_disambiguation_dict ()

# One ``case`` per remaining entry, ordered by OpenType tag. ``write`` is
# used for the lines that carry names, and a bare ``print`` ends each line.
for ot_tag in sorted (disambiguation):
	bcp_47_tag = disambiguation[ot_tag]
	case_label = hb_tag (ot_tag)
	write ('  case %s:  /* %s */' % (case_label, ot.names[ot_tag]))
	print ()
	language_name = bcp_47.get_name (LanguageTag (bcp_47_tag))
	write ('    return hb_language_from_string (\"%s\", -1);  /* %s */' % (bcp_47_tag, language_name))
	print ()

# Default branch, end of the C function, and the footer of the generated
# header.
print ('\n'.join ((
	'  default:',
	'    return HB_LANGUAGE_INVALID;',
	'  }',
	'}',
	'',
	'#endif /* HB_OT_TAG_TABLE_HH */',
	'',
	'/* == End of generated table == */',
)))
1112
1113