1#!/usr/local/bin/python3.8
2
3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
4versa.
5
It creates a ``const LangTag[]``, matching the tags from the OpenType
language system tag list to the language subtags of the BCP 47 language
8subtag registry, with some manual adjustments. The mappings are
9supplemented with macrolanguages' sublanguages and retired codes'
10replacements, according to BCP 47 and some manual additions where BCP 47
11omits a retired code entirely.
12
13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16multiple BCP 47 tags) are listed here, except when the alphabetically
17first BCP 47 tag happens to be the chosen disambiguated tag. In that
18case, the fallback behavior will choose the right tag anyway.
19
20usage: ./gen-tag-table.py languagetags language-subtag-registry
21
22Input files:
23* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
25"""
26
27import collections
28import html
29from html.parser import HTMLParser
30import itertools
31import re
32import sys
33import unicodedata
34
# Exactly two command-line arguments (the two input files) are required;
# otherwise exit, using the module docstring as the usage message.
if len (sys.argv) != 3:
	sys.exit (__doc__)
37
def expect (condition, message=None):
	"""Raise ``AssertionError`` unless ``condition`` is truthy.

	Args:
		condition: The value that is required to be truthy.
		message: Optional argument for the raised ``AssertionError``.
			When it is ``None`` the exception is raised without
			arguments.

	Raises:
		AssertionError: If ``condition`` is falsy.
	"""
	if condition:
		return
	if message is not None:
		raise AssertionError (message)
	raise AssertionError
43
def write (s):
	"""Write ``s`` to standard output as UTF-8 bytes.

	The text layer of ``sys.stdout`` is flushed first so that anything
	already printed stays ahead of the bytes written directly to the
	underlying binary buffer.

	Args:
		s (str): The text to write.
	"""
	stream = sys.stdout
	stream.flush ()
	stream.buffer.write (s.encode ('utf-8'))
47
# The OpenType tag used for the default language system. It is the empty
# string, so it can never collide with a real tag read from the registry.
DEFAULT_LANGUAGE_SYSTEM = ''

# Map of ISO 639-3 (three-letter) codes to their ISO 639-1 (two-letter)
# equivalents. The OpenType registry parser uses it to replace a three-letter
# code with its two-letter form when one exists; codes not listed here are
# kept as they are.
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
	'aar': 'aa',
	'abk': 'ab',
	'afr': 'af',
	'aka': 'ak',
	'amh': 'am',
	'ara': 'ar',
	'arg': 'an',
	'asm': 'as',
	'ava': 'av',
	'ave': 'ae',
	'aym': 'ay',
	'aze': 'az',
	'bak': 'ba',
	'bam': 'bm',
	'bel': 'be',
	'ben': 'bn',
	'bis': 'bi',
	'bod': 'bo',
	'bos': 'bs',
	'bre': 'br',
	'bul': 'bg',
	'cat': 'ca',
	'ces': 'cs',
	'cha': 'ch',
	'che': 'ce',
	'chu': 'cu',
	'chv': 'cv',
	'cor': 'kw',
	'cos': 'co',
	'cre': 'cr',
	'cym': 'cy',
	'dan': 'da',
	'deu': 'de',
	'div': 'dv',
	'dzo': 'dz',
	'ell': 'el',
	'eng': 'en',
	'epo': 'eo',
	'est': 'et',
	'eus': 'eu',
	'ewe': 'ee',
	'fao': 'fo',
	'fas': 'fa',
	'fij': 'fj',
	'fin': 'fi',
	'fra': 'fr',
	'fry': 'fy',
	'ful': 'ff',
	'gla': 'gd',
	'gle': 'ga',
	'glg': 'gl',
	'glv': 'gv',
	'grn': 'gn',
	'guj': 'gu',
	'hat': 'ht',
	'hau': 'ha',
	'hbs': 'sh',
	'heb': 'he',
	'her': 'hz',
	'hin': 'hi',
	'hmo': 'ho',
	'hrv': 'hr',
	'hun': 'hu',
	'hye': 'hy',
	'ibo': 'ig',
	'ido': 'io',
	'iii': 'ii',
	'iku': 'iu',
	'ile': 'ie',
	'ina': 'ia',
	'ind': 'id',
	'ipk': 'ik',
	'isl': 'is',
	'ita': 'it',
	'jav': 'jv',
	'jpn': 'ja',
	'kal': 'kl',
	'kan': 'kn',
	'kas': 'ks',
	'kat': 'ka',
	'kau': 'kr',
	'kaz': 'kk',
	'khm': 'km',
	'kik': 'ki',
	'kin': 'rw',
	'kir': 'ky',
	'kom': 'kv',
	'kon': 'kg',
	'kor': 'ko',
	'kua': 'kj',
	'kur': 'ku',
	'lao': 'lo',
	'lat': 'la',
	'lav': 'lv',
	'lim': 'li',
	'lin': 'ln',
	'lit': 'lt',
	'ltz': 'lb',
	'lub': 'lu',
	'lug': 'lg',
	'mah': 'mh',
	'mal': 'ml',
	'mar': 'mr',
	'mkd': 'mk',
	'mlg': 'mg',
	'mlt': 'mt',
	'mol': 'mo',
	'mon': 'mn',
	'mri': 'mi',
	'msa': 'ms',
	'mya': 'my',
	'nau': 'na',
	'nav': 'nv',
	'nbl': 'nr',
	'nde': 'nd',
	'ndo': 'ng',
	'nep': 'ne',
	'nld': 'nl',
	'nno': 'nn',
	'nob': 'nb',
	'nor': 'no',
	'nya': 'ny',
	'oci': 'oc',
	'oji': 'oj',
	'ori': 'or',
	'orm': 'om',
	'oss': 'os',
	'pan': 'pa',
	'pli': 'pi',
	'pol': 'pl',
	'por': 'pt',
	'pus': 'ps',
	'que': 'qu',
	'roh': 'rm',
	'ron': 'ro',
	'run': 'rn',
	'rus': 'ru',
	'sag': 'sg',
	'san': 'sa',
	'sin': 'si',
	'slk': 'sk',
	'slv': 'sl',
	'sme': 'se',
	'smo': 'sm',
	'sna': 'sn',
	'snd': 'sd',
	'som': 'so',
	'sot': 'st',
	'spa': 'es',
	'sqi': 'sq',
	'srd': 'sc',
	'srp': 'sr',
	'ssw': 'ss',
	'sun': 'su',
	'swa': 'sw',
	'swe': 'sv',
	'tah': 'ty',
	'tam': 'ta',
	'tat': 'tt',
	'tel': 'te',
	'tgk': 'tg',
	'tgl': 'tl',
	'tha': 'th',
	'tir': 'ti',
	'ton': 'to',
	'tsn': 'tn',
	'tso': 'ts',
	'tuk': 'tk',
	'tur': 'tr',
	'twi': 'tw',
	'uig': 'ug',
	'ukr': 'uk',
	'urd': 'ur',
	'uzb': 'uz',
	'ven': 've',
	'vie': 'vi',
	'vol': 'vo',
	'wln': 'wa',
	'wol': 'wo',
	'xho': 'xh',
	'yid': 'yi',
	'yor': 'yo',
	'zha': 'za',
	'zho': 'zh',
	'zul': 'zu',
}
238
class LanguageTag (object):
	"""A BCP 47 language tag split into its subtags.

	The tag is lowercased and split on hyphens. A tag listed as
	grandfathered in the module-level ``bcp_47`` registry is kept whole;
	for every other tag the script, region and variant subtags are
	picked out by their length and by whether they start with a digit.

	Attributes:
		subtags (List[str]): The lowercased subtags of this tag.
		grandfathered (bool): Whether this tag is grandfathered. If
			``True``, ``language`` is the entire lowercased tag and
			the other subtag fields are empty strings.
		language (str): The language subtag.
		script (Optional[str]): The script subtag, or ``None`` if the
			tag has none.
		region (Optional[str]): The region subtag, or ``None`` if the
			tag has none.
		variant (Optional[str]): The variant subtag, or ``None`` if
			the tag has none.

	Args:
		tag (str): A BCP 47 language tag.

	"""
	def __init__ (self, tag):
		lowered = tag.lower ()
		self.subtags = lowered.split ('-')
		self.grandfathered = lowered in bcp_47.grandfathered
		if self.grandfathered:
			# A grandfathered tag is opaque: it is not broken down further.
			self.language = lowered
			self.script = self.region = self.variant = ''
			return

		def is_script (s):
			# Four characters, the first one sorting after the digits.
			return len (s) == 4 and s[0] > '9'

		def is_region (s):
			# Two characters not starting with a digit, or three
			# characters starting with a digit.
			return (len (s) == 2 and s[0] > '9') or (len (s) == 3 and s[0] <= '9')

		def is_variant (s):
			# More than four characters, or four characters starting
			# with a digit.
			return len (s) > 4 or (len (s) == 4 and s[0] <= '9')

		self.language = self.subtags[0]
		self.script = self._find_first (is_script, self.subtags)
		# Only the region search skips the leading (language) subtag.
		self.region = self._find_first (is_region, self.subtags[1:])
		self.variant = self._find_first (is_variant, self.subtags)

	def __str__(self):
		"""Return the lowercased tag, with its subtags joined by hyphens."""
		separator = '-'
		return separator.join (self.subtags)

	def __repr__ (self):
		"""Return a ``LanguageTag('...')`` representation of this tag."""
		return 'LanguageTag(%r)' % (str (self),)

	@staticmethod
	def _find_first (function, sequence):
		"""Return the first item of ``sequence`` accepted by ``function``.

		Args:
			function: A one-argument predicate.
			sequence: The items to search, in order.

		Returns:
			The first matching item, or ``None`` if there is none.
		"""
		return next (filter (function, sequence), None)

	def is_complex (self):
		"""Return whether this tag is too complex for a ``LangTag`` entry.

		A tag with a single subtag is never complex. A tag with several
		subtags is complex unless it is grandfathered, its second subtag
		is not three characters long, and its first subtag has the same
		OpenType mappings as the whole tag.

		Complex tags need to be handled in
		``hb_ot_tags_from_complex_language``.

		Returns:
			Whether this tag is complex.
		"""
		if len (self.subtags) == 1:
			return False
		if not self.grandfathered:
			return True
		if len (self.subtags[1]) == 3:
			return True
		# Plain indexing is deliberate: ``ot.from_bcp_47`` is a
		# ``defaultdict``, so a missing key is inserted on lookup.
		return ot.from_bcp_47[self.subtags[0]] != ot.from_bcp_47[self.language]

	def get_group (self):
		"""Return the group under which this tag is emitted in
		``hb_ot_tags_from_complex_language``.

		The group is the first letter of the language subtag, or
		``'und'`` when the tag should not be matched in a ``switch``
		statement in the generated code: that is, when the language is
		``und`` or when the variant has exactly one registered prefix.

		Returns:
			This tag's group.
		"""
		if self.language == 'und':
			return 'und'
		if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
			return 'und'
		return self.language[0]
314
class OpenTypeRegistryParser (HTMLParser):
	"""A parser for the OpenType language system tag registry.

	The registry is an HTML page; each table row contributes a language
	name, an OpenType tag and, optionally, a comma-separated list of ISO
	639 codes. Several methods read the module-level ``bcp_47`` registry.

	Attributes:
		header (str): The "last updated" line of the registry.
		names (Mapping[str, str]): A map of language system tags to the
			names they are given in the registry.
		ranks (DefaultDict[str, int]): A map of language system tags to
			numbers. If a single BCP 47 tag corresponds to multiple
			OpenType tags, the tags are ordered in increasing order by
			rank. The rank is based on the number of BCP 47 tags
			associated with a tag, though it may be manually modified.
		to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
			OpenType language system tags to sets of BCP 47 tags.
		from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
			inverted. Its values start as unsorted sets;
			``sort_languages`` converts them to sorted lists.
		from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
			A shallow copy of ``from_bcp_47``. It starts as ``None``
			and is populated at the beginning of the first call to
			``inherit_from_macrolanguages``.

	"""
	def __init__ (self):
		HTMLParser.__init__ (self)
		self.header = ''
		self.names = {}
		self.ranks = collections.defaultdict (int)
		self.to_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47_uninherited = None
		# Whether the parser is in a <td> element
		self._td = False
		# Whether the parser is after a <br> element within the current <tr> element
		self._br = False
		# The text of the <td> elements of the current <tr> element.
		self._current_tr = []

	def handle_starttag (self, tag, attrs):
		"""Track table structure and capture the ``updated_at`` header."""
		if tag == 'br':
			self._br = True
		elif tag == 'meta':
			for attr, value in attrs:
				if attr == 'name' and value == 'updated_at':
					self.header = self.get_starttag_text ()
					break
		elif tag == 'td':
			self._td = True
			self._current_tr.append ('')
		elif tag == 'tr':
			self._br = False
			self._current_tr = []

	def handle_endtag (self, tag):
		"""Close a cell, or register a finished row that has cells.

		Raises:
			AssertionError: If a row does not have two or three cells,
				or if its tag cell is ill-formed.
		"""
		if tag == 'td':
			self._td = False
		elif tag == 'tr' and self._current_tr:
			expect (2 <= len (self._current_tr) <= 3)
			self._register_row (self._current_tr)

	def _register_row (self, cells):
		"""Record the name, BCP 47 mappings and rank from one table row.

		Args:
			cells (List[str]): The text of the row's cells: the
				language name, the quoted OpenType tag and,
				optionally, the comma-separated ISO 639 codes.
		"""
		name = cells[0].strip ()
		ot_tag = cells[1].strip ("\t\n\v\f\r '")
		rank = 0
		if len (ot_tag) > 4:
			# The only accepted over-long form is "TAG (deprecated)".
			expect (ot_tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % ot_tag)
			name += ' (deprecated)'
			ot_tag = ot_tag.split (' ')[0]
			rank = 1
		self.names[ot_tag] = re.sub (' languages$', '', name)
		# A two-cell row has no code cell at all; treat it like a row
		# whose code cell is empty instead of indexing past the end.
		if len (cells) < 3 or not cells[2]:
			return
		iso_codes = cells[2].strip ()
		self.to_bcp_47[ot_tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
		rank += 2 * len (self.to_bcp_47[ot_tag])
		self.ranks[ot_tag] = rank

	def handle_data (self, data):
		"""Append text to the current cell, ignoring text after a <br>."""
		if self._td and not self._br:
			self._current_tr[-1] += data

	def handle_charref (self, name):
		"""Treat a numeric character reference as the text it stands for."""
		self.handle_data (html.unescape ('&#%s;' % name))

	def handle_entityref (self, name):
		"""Treat a named character reference as the text it stands for."""
		self.handle_data (html.unescape ('&%s;' % name))

	def parse (self, filename):
		"""Parse the OpenType language system tag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If no ``updated_at`` header was found.
		"""
		with open (filename, encoding='utf-8') as f:
			self.feed (f.read ())
		expect (self.header)
		for tag, iso_codes in self.to_bcp_47.items ():
			for iso_code in iso_codes:
				self.from_bcp_47[iso_code].add (tag)

	def add_language (self, bcp_47_tag, ot_tag):
		"""Add a language as if it were in the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
				a language subtag, and if the language subtag is a
				macrolanguage, then new languages are added corresponding
				to the macrolanguages' individual languages with the
				remainder of the tag appended.
			ot_tag (str): An OpenType language system tag.
		"""
		self.to_bcp_47[ot_tag].add (bcp_47_tag)
		self.from_bcp_47[bcp_47_tag].add (ot_tag)
		if bcp_47_tag.lower () in bcp_47.grandfathered:
			return
		macrolanguage, separator, suffix = bcp_47_tag.partition ('-')
		# Nothing more to do for a bare language subtag or for a
		# language that is not a known macrolanguage.
		if not separator or macrolanguage not in bcp_47.macrolanguages:
			return
		bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = {
			'%s-%s' % (language, suffix)
			for language in bcp_47.macrolanguages[macrolanguage]
			if language.lower () not in bcp_47.grandfathered
		}

	@staticmethod
	def _remove_language (tag_1, dict_1, dict_2):
		"""Remove ``tag_1`` from ``dict_1`` and from the values of ``dict_2``.

		Keys of ``dict_2`` whose value becomes empty are deleted.

		Raises:
			KeyError: If ``tag_1`` is not a key of ``dict_1``.
		"""
		for tag_2 in dict_1.pop (tag_1):
			dict_2[tag_2].remove (tag_1)
			if not dict_2[tag_2]:
				del dict_2[tag_2]

	def remove_language_ot (self, ot_tag):
		"""Remove an OpenType tag from the registry.

		Args:
			ot_tag (str): An OpenType tag.
		"""
		self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

	def remove_language_bcp_47 (self, bcp_47_tag):
		"""Remove a BCP 47 tag from the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag.
		"""
		self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

	def inherit_from_macrolanguages (self):
		"""Copy mappings from macrolanguages to individual languages.

		If a BCP 47 tag for an individual mapping has no OpenType
		mapping but its macrolanguage does, the mapping is copied to
		the individual language. For example, als (Tosk Albanian) has no
		explicit mapping, so it inherits from sq (Albanian) the mapping
		to SQI.

		However, if an OpenType tag maps to a BCP 47 macrolanguage and
		some but not all of its individual languages, the mapping is not
		inherited from the macrolanguage to the missing individual
		languages. For example, INUK (Nunavik Inuktitut) is mapped to
		ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
		ikt (Inuinnaqtun, which is an individual language of iu), so
		this method does not add a mapping from ikt to INUK.

		If a BCP 47 tag for a macrolanguage has no OpenType mapping but
		some of its individual languages do, their mappings are copied
		to the macrolanguage.
		"""
		first_time = self.from_bcp_47_uninherited is None
		if first_time:
			# Shallow copy: the set of keys is frozen here, but the
			# value sets are still shared with ``from_bcp_47``.
			self.from_bcp_47_uninherited = dict (self.from_bcp_47)
		# Iterate over a copy because ``add_language`` may add keys to
		# ``bcp_47.macrolanguages``.
		for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
			ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ()))
			blocked_ot_macrolanguages = set ()
			if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
				# The same for every OpenType tag, so computed once.
				round_trip_languages = {
					l for l in languages
					if 'retired code' not in bcp_47.scopes.get (l, '')
				}
				for ot_macrolanguage in ot_macrolanguages:
					round_trip_macrolanguages = {
						l for l in self.to_bcp_47[ot_macrolanguage]
						if 'retired code' not in bcp_47.scopes.get (l, '')
					}
					intersection = round_trip_macrolanguages & round_trip_languages
					# Block tags that cover some but not all of the
					# individual languages.
					if intersection and intersection != round_trip_languages:
						blocked_ot_macrolanguages.add (ot_macrolanguage)
			if ot_macrolanguages:
				for ot_macrolanguage in ot_macrolanguages:
					if ot_macrolanguage not in blocked_ot_macrolanguages:
						for language in languages:
							self.add_language (language, ot_macrolanguage)
							if not blocked_ot_macrolanguages:
								self.ranks[ot_macrolanguage] += 1
			elif first_time:
				# The macrolanguage has no mapping of its own: collect
				# those of its individual languages, giving up as soon
				# as one of them has none.
				for language in languages:
					if language in self.from_bcp_47_uninherited:
						ot_macrolanguages |= self.from_bcp_47_uninherited[language]
					else:
						ot_macrolanguages.clear ()
					if not ot_macrolanguages:
						break
				for ot_macrolanguage in ot_macrolanguages:
					self.add_language (macrolanguage, ot_macrolanguage)

	def sort_languages (self):
		"""Sort the values of ``from_bcp_47`` in ascending rank order.

		Ties are broken by the OpenType tag itself. Each value is
		replaced by a sorted list.
		"""
		for language, tags in self.from_bcp_47.items ():
			self.from_bcp_47[language] = sorted (tags,
					key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
527
# The module-level OpenType registry. Other definitions in this file read it
# through the global name ``ot``.
ot = OpenTypeRegistryParser ()
529
class BCP47Parser (object):
	"""A parser for the BCP 47 subtag registry.

	``_add_macrolanguage`` (and therefore ``parse`` and
	``remove_extra_macrolanguages``) reads the module-level ``ot``
	registry.

	Attributes:
		header (str): The "File-Date" line of the registry.
		names (Mapping[str, str]): A map of subtags to the names they
			are given in the registry. Each value is a
			``'\\n'``-separated list of names.
		scopes (Mapping[str, str]): A map of language subtags to strings
			suffixed to language names, including suffixes to explain
			language scopes.
		macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
			language subtags to the sets of language subtags which
			inherit from them. See
			``OpenTypeRegistryParser.inherit_from_macrolanguages``.
		prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
			subtags to their prefixes.
		grandfathered (AbstractSet[str]): The set of grandfathered tags,
			normalized to lowercase.

	"""
	# Trailing scope markers that are dropped from registry descriptions.
	_DESCRIPTION_SUFFIX = re.compile (r' (\(family\)|\((individual |macro)language\)|languages)$')

	def __init__ (self):
		self.header = ''
		self.names = {}
		self.scopes = {}
		self.macrolanguages = collections.defaultdict (set)
		self.prefixes = collections.defaultdict (set)
		self.grandfathered = set ()

	@staticmethod
	def _clean_description (description):
		"""Return a registry description without its scope markers.

		Args:
			description (str): The value of a ``Description`` field.

		Returns:
			``description`` with every `` (individual language)``
			removed and one trailing `` (family)``,
			`` (macrolanguage)`` or `` languages`` dropped.
		"""
		description = description.replace (' (individual language)', '')
		return BCP47Parser._DESCRIPTION_SUFFIX.sub ('', description)

	def parse (self, filename):
		"""Parse the BCP 47 subtag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If the registry has no ``File-Date`` line.
		"""
		with open (filename, encoding='utf-8') as f:
			subtag_type = None
			subtag = None
			deprecated = False
			has_preferred_value = False
			line_buffer = ''
			# Lines are handled one step late so that continuation lines
			# can be folded in first; the extra empty line at the end
			# flushes the last buffered line.
			for line in itertools.chain (f, ['']):
				line = line.rstrip ()
				if line.startswith (' '):
					line_buffer += line[1:]
					continue
				line, line_buffer = line_buffer, line
				if line.startswith ('Type: '):
					subtag_type = line.split (' ')[1]
					deprecated = False
					has_preferred_value = False
				elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
					subtag = line.split (' ')[1]
					if subtag_type == 'grandfathered':
						self.grandfathered.add (subtag.lower ())
				elif line.startswith ('Description: '):
					description = self._clean_description (line.split (' ', 1)[1])
					if subtag in self.names:
						self.names[subtag] += '\n' + description
					else:
						self.names[subtag] = description
				elif subtag_type == 'language' or subtag_type == 'grandfathered':
					if line.startswith ('Scope: '):
						scope = line.split (' ')[1]
						if scope == 'macrolanguage':
							scope = ' [macrolanguage]'
						elif scope == 'collection':
							scope = ' [collection]'
						else:
							continue
						self.scopes[subtag] = scope
					elif line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
						deprecated = True
					elif deprecated and line.startswith ('Comments: see '):
						# If a subtag is split into multiple replacement subtags,
						# it essentially represents a macrolanguage.
						for language in line.replace (',', '').split (' ')[2:]:
							self._add_macrolanguage (subtag, language)
					elif line.startswith ('Preferred-Value: '):
						# If a subtag is deprecated in favor of a single replacement subtag,
						# it is either a dialect or synonym of the preferred subtag. Either
						# way, it is close enough to the truth to consider the replacement
						# the macrolanguage of the deprecated language.
						has_preferred_value = True
						macrolanguage = line.split (' ')[1]
						self._add_macrolanguage (macrolanguage, subtag)
					elif not has_preferred_value and line.startswith ('Macrolanguage: '):
						self._add_macrolanguage (line.split (' ')[1], subtag)
				elif subtag_type == 'variant':
					if line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
					elif line.startswith ('Prefix: '):
						self.prefixes[subtag].add (line.split (' ')[1])
				elif line.startswith ('File-Date: '):
					self.header = line
		expect (self.header)

	def _add_macrolanguage (self, macrolanguage, language):
		"""Record ``language`` as inheriting from ``macrolanguage``.

		If ``language`` has no OpenType mapping, the languages that
		inherit from it are first re-attached to ``macrolanguage`` too.
		If ``macrolanguage`` has no OpenType mapping and is itself a
		member of some recorded set, ``language`` is added to the first
		such set found instead.
		"""
		if language not in ot.from_bcp_47:
			for l in self.macrolanguages.get (language, set ()):
				self._add_macrolanguage (macrolanguage, l)
		if macrolanguage not in ot.from_bcp_47:
			for ls in list (self.macrolanguages.values ()):
				if macrolanguage in ls:
					ls.add (language)
					return
		self.macrolanguages[macrolanguage].add (language)

	def remove_extra_macrolanguages (self):
		"""Make every language have at most one macrolanguage."""
		inverted = collections.defaultdict (list)
		for macrolanguage, languages in self.macrolanguages.items ():
			for language in languages:
				inverted[language].append (macrolanguage)
		for language, macrolanguages in inverted.items ():
			if len (macrolanguages) > 1:
				# Keep the macrolanguage with the most languages and
				# attach the others to it.
				macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
				biggest_macrolanguage = macrolanguages.pop ()
				for macrolanguage in macrolanguages:
					self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

	def _get_name_piece (self, subtag):
		"""Return the first name of a subtag plus its scope suffix.

		Args:
			subtag (str): A BCP 47 subtag.

		Returns:
			The name form of ``subtag``.

		Raises:
			KeyError: If ``subtag`` has no recorded name.
		"""
		return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')

	def get_name (self, lt):
		"""Return the names of the subtags in a language tag.

		The script subtag is looked up in title case and the region
		subtag in upper case.

		Args:
			lt (LanguageTag): A BCP 47 language tag.

		Returns:
			The name form of ``lt``: the pieces for the language,
			script, region and variant (those that are present), joined
			with ``'; '``.
		"""
		name = self._get_name_piece (lt.language)
		if lt.script:
			name += '; ' + self._get_name_piece (lt.script.title ())
		if lt.region:
			name += '; ' + self._get_name_piece (lt.region.upper ())
		if lt.variant:
			name += '; ' + self._get_name_piece (lt.variant)
		return name
683
# The module-level BCP 47 registry. Other definitions in this file read it
# through the global name ``bcp_47``.
bcp_47 = BCP47Parser ()

# Load both registries from the files named on the command line.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Manual adjustments to the parsed registries follow: extra mappings, replaced
# mappings, rank tweaks, and hand-written names, scopes and macrolanguage
# relations. Where ``remove_language_ot`` is followed by ``add_language`` for
# the same OpenType tag, the registry's own mappings for that tag are
# discarded in favor of the hand-written ones.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

# Rank BAL just after KAR.
ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

# flm is given a name, the retired-code scope and a replacement by hand.
bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

# Rank FNE just after TNE.
ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.add_language ('hy-arevmda', 'HYE')

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# KHN is added as a second tag for kht, named after KHT and ranked just
# after it.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

# Rank LCR just after MCR.
ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

# mhv is given a name and the retired-code scope by hand.
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('mnw-TH', 'MONT')

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# QUZ is re-pointed at qu, and each of the following q?? codes is assigned
# by hand to one of QWH, QVI and QUH.
ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# ``macrolanguages`` is a defaultdict, so this creates the 'ro-MD' entry if
# it does not exist yet.
bcp_47.macrolanguages['ro-MD'].add ('mo')

ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

# xst is given a name, the retired-code scope and two replacements by hand.
bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# The registry's mappings for ZHH, ZHP, ZHT and ZHTM are discarded, lzh and
# yue stop inheriting from zh, and the zh, lzh and yue mappings are then
# written out by hand (ZHP gets no replacement here).
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-MO', 'ZHTM')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
813
def rank_delta (bcp_47, ot):
	"""Return the adjustment to an OpenType tag's rank for one BCP 47 tag.

	Most OpenType tags have a constant rank, but a few have ranks that
	depend on the BCP 47 tag they are being sorted under.

	Args:
		bcp_47 (str): A BCP 47 tag.
		ot (str): An OpenType tag.

	Returns:
		A number to add to ``ot``'s rank when sorting ``bcp_47``'s
		OpenType equivalents: ``-1`` for the pairs ``ak``/``AKA`` and
		``tw``/``TWI``, and ``0`` for every other pair.
	"""
	promoted_pairs = (('ak', 'AKA'), ('tw', 'TWI'))
	return -1 if (bcp_47, ot) in promoted_pairs else 0
833
# A hand-written map of OpenType tags to one BCP 47 tag each.
# NOTE(review): nothing in this part of the file reads this table; going by
# the module docstring it presumably supplies the chosen tag for ambiguous
# OpenType tags in ``hb_ot_ambiguous_tag_to_language`` - confirm against the
# code that uses it.
disambiguation = {
	'ALT': 'alt',
	'ARK': 'rki',
	'ATH': 'ath',
	'BHI': 'bhb',
	'BLN': 'bjt',
	'BTI': 'beb',
	'CCHN': 'cco',
	'CMR': 'swb',
	'CPP': 'crp',
	'CRR': 'crx',
	'DUJ': 'dwu',
	'ECR': 'crj',
	'HAL': 'cfm',
	'HND': 'hnd',
	'HYE': 'hyw',
	'KIS': 'kqs',
	'KUI': 'uki',
	'LRC': 'bqi',
	'NDB': 'nd',
	'NIS': 'njz',
	'PLG': 'pce',
	'PRO': 'pro',
	'QIN': 'bgr',
	'QUH': 'quh',
	'QVI': 'qvi',
	'QWH': 'qwh',
	'SIG': 'stv',
	'SRB': 'sr',
	'SXT': 'xnj',
	'ZHH': 'zh-HK',
	'ZHS': 'zh-Hans',
	'ZHT': 'zh-Hant',
	'ZHTM': 'zh-MO',
}

# Inherit mappings once, reduce every language to at most one macrolanguage,
# then inherit again with the reduced relations.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
# The default language system gets a name and a rank above every other tag.
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
# For every OpenType tag of exactly three capital letters whose lowercase
# form is a named BCP 47 subtag without any OpenType mapping, map that BCP 47
# subtag to the default language system and give it an empty set of
# inheriting languages. (Indexing ``from_bcp_47`` here also inserts an empty
# entry for each subtag tested, because it is a defaultdict.)
for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
	possible_bcp_47_tag = tricky_ot_tag.lower ()
	if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
		ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
		bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
ot.sort_languages ()

# Emit the header of the generated file: a comment recording how it was
# produced and from which registry versions, the include guard, and the
# opening of the ``ot_languages`` table.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')
898
def hb_tag (tag):
	"""Convert a tag to ``HB_TAG`` form.

	The default language system becomes ``HB_TAG_NONE`` (followed by
	padding). Any other tag is padded with spaces, or truncated, to
	exactly four characters, which become the arguments of ``HB_TAG``.

	Args:
		tag (str): An OpenType tag.

	Returns:
		A snippet of C++ representing ``tag``.
	"""
	if tag == DEFAULT_LANGUAGE_SYSTEM:
		return 'HB_TAG_NONE\t       '
	four_characters = tag.ljust (4)[:4]
	return "HB_TAG('%s','%s','%s','%s')" % tuple (four_characters)
911
def get_variant_set (name):
	"""Return a set of variant language names from a name.

	``name`` is split on newlines, parentheses and commas. Each piece
	has U+2019 replaced by an apostrophe, is decomposed (NFD), reduced
	to its ASCII characters and stripped of surrounding whitespace.
	Pieces that end up empty (whitespace between two separators, or text
	with no ASCII characters) are dropped, so that unrelated names
	cannot appear to share an empty variant.

	Args:
		name (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		A set of normalized language names, as ``bytes``.
	"""
	variants = set ()
	for piece in re.split ('[\n(),]', name):
		normalized = (unicodedata.normalize ('NFD', piece.replace ('\u2019', "'"))
				.encode ('ASCII', 'ignore')
				.strip ())
		# Test for emptiness only after normalization.
		if normalized:
			variants.add (normalized)
	return variants
926
def language_name_intersection (a, b):
	"""Return the names in common between two language names.

	Both arguments are broken into normalized variant names with
	``get_variant_set`` before being compared.

	Args:
		a (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.
		b (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		The normalized language names shared by ``a`` and ``b``.
	"""
	variants_of_a = get_variant_set (a)
	variants_of_b = get_variant_set (b)
	return variants_of_a & variants_of_b
940
def get_matching_language_name (intersection, candidates):
	"""Return the first candidate that shares a variant name with ``intersection``.

	Args:
		intersection (AbstractSet[bytes]): A set of normalized language
			names, as returned by ``language_name_intersection``.
		candidates (Iterable[str]): Language names to test, in order.

	Returns:
		The first name in ``candidates`` whose variant set is not
		disjoint from ``intersection``.

	Raises:
		StopIteration: If no candidate matches.
	"""
	matching = (candidate for candidate in candidates
			if not intersection.isdisjoint (get_variant_set (candidate)))
	return next (matching)
943
def same_tag (bcp_47_tag, ot_tags):
	"""Return whether a BCP 47 tag maps only to its own uppercase form.

	Args:
		bcp_47_tag (str): A BCP 47 tag.
		ot_tags (List[str]): The OpenType tags ``bcp_47_tag`` maps to.

	Returns:
		``True`` if ``bcp_47_tag`` is three characters long and
		``ot_tags`` holds exactly one tag which, lowercased, equals
		``bcp_47_tag``; ``False`` otherwise.
	"""
	if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
		return False
	return ot_tags[0].lower () == bcp_47_tag
946
# Emit one ``LangTag`` entry per (BCP 47 tag, OpenType tag) pair, in sorted
# order of BCP 47 tag. The empty tag and tags with more than one subtag are
# not part of this table.
for language, tags in sorted (ot.from_bcp_47.items ()):
	if language == '' or '-' in language:
		continue
	# An entry whose OpenType tag is just the uppercased BCP 47 tag is still
	# printed, but inside a C comment.
	commented_out = same_tag (language, tags)
	for i, tag in enumerate (tags, start=1):
		print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else '  ', language, hb_tag (tag)), end='')
		if commented_out:
			print ('*/', end='')
		# Each entry is followed by a comment naming the language(s).
		print ('\t/* ', end='')
		bcp_47_name = bcp_47.names.get (language, '')
		bcp_47_name_candidates = bcp_47_name.split ('\n')
		ot_name = ot.names[tag]
		scope = bcp_47.scopes.get (language, '')
		if tag == DEFAULT_LANGUAGE_SYSTEM:
			# The BCP 47 tag is mapped to the default language system;
			# show that it differs from the OpenType tag spelled the same.
			write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
		else:
			intersection = language_name_intersection (bcp_47_name, ot_name)
			if not intersection:
				# The two registries share no name: show both.
				write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
			else:
				# Keep only the BCP 47 name that matched and show the
				# longer of it and the OpenType name.
				name = get_matching_language_name (intersection, bcp_47_name_candidates)
				bcp_47.names[language] = name
				write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
		print (' */')

print ('};')
print ()

# Emit the documentation comment and the signature of the generated
# ``hb_ot_tags_from_complex_language`` function, up to its opening brace.
print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static bool')
print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
print ('\t\t\t\t  const char   *limit,')
print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
print ('{')
995
def print_subtag_matches (subtag, new_line):
	"""Print a ``subtag_matches`` call for a subtag, if there is one.

	Nothing is printed for a falsy ``subtag``. No trailing newline is
	printed.

	Args:
		subtag (Optional[str]): The subtag to match.
		new_line (bool): Whether to start a new line and join the call
			to the preceding condition with ``&&``.
	"""
	if not subtag:
		return
	lead = '\n\t&& ' if new_line else ''
	print ('%ssubtag_matches (lang_str, limit, "-%s")' % (lead, subtag), end='')
1002
# Bucket every BCP 47 tag for which ``LanguageTag.is_complex`` is true by the
# value of ``LanguageTag.get_group``. Tags are visited longest first, ties
# broken alphabetically, and each bucket keeps that visiting order. Each
# bucket entry is a ``(LanguageTag, OpenType tags)`` pair.
complex_tags = collections.defaultdict (list)
for bcp_47_string, mapped_ot_tags in sorted (ot.from_bcp_47.items (),
		key=lambda item: (-len (item[0]), item[0])):
	candidate = LanguageTag (bcp_47_string)
	if candidate.is_complex ():
		complex_tags[candidate.get_group ()].append ((candidate, mapped_ot_tags))
1011
# Emit one standalone ``if`` statement per entry of the 'und' group. Every
# other group is skipped here.
for initial, items in sorted (complex_tags.items ()):
	if initial != 'und':
		continue
	for lt, tags in items:
		# An entry with no OpenType tags produces no code.
		if not tags:
			continue
		if lt.variant in bcp_47.prefixes:
			expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
					'%s is not a valid prefix of %s' % (lt.language, lt.variant))
		print ('  if (', end='')
		# The first subtag that is present is printed bare; every later one
		# is printed on a continuation line joined with ``&&``. Without this
		# an entry with two or more subtags would yield adjacent calls with
		# no operator between them, which is not valid C.
		need_conjunction = False
		for und_subtag in (lt.script, lt.region, lt.variant):
			if und_subtag:
				print_subtag_matches (und_subtag, need_conjunction)
				need_conjunction = True
		print (')')
		print ('  {')
		write ('    /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) == 1:
			write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('    *count = 1;')
		else:
			# The generated loop below uses ``i``; declare it in this
			# scope, as the multi-tag branch of the switch cases does.
			print ('    unsigned int i;')
			print ('    hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('    };')
			# Copy at most ``*count`` tags and report how many were copied.
			print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('      tags[i] = possible_tags[i];')
			print ('    *count = i;')
		print ('    return true;')
		print ('  }')
1044
# Emit a switch on lang_str[0] with one ``case`` per group other than 'und',
# then close the generated function with a ``return false`` fallback.
print ('  switch (lang_str[0])\n  {')
for group_key, group_entries in sorted (complex_tags.items ()):
	if group_key == 'und':
		continue
	print ("  case '%s':" % group_key)
	for lang_tag, ot_tags in group_entries:
		# An entry with no OpenType tags produces no code.
		if not ot_tags:
			continue
		# Comparisons are made against lang_str[1] onwards, so the first
		# character of the language is dropped from the literal.
		tail = lang_tag.language[1:]
		# Subtags still to be tested by separate subtag_matches calls; they
		# are cleared once folded into the leading string comparison.
		pending_script = lang_tag.script
		pending_region = lang_tag.region
		if lang_tag.grandfathered:
			condition = '0 == strcmp (&lang_str[1], "%s")' % tail
		else:
			literal = tail + '-'
			if pending_script:
				literal += pending_script
				pending_script = None
				if pending_region:
					literal += '-' + pending_region
					pending_region = None
			if literal.endswith ('-'):
				# No script was appended: compare only the "language-" prefix.
				condition = '0 == strncmp (&lang_str[1], "%s", %i)' % (literal, len (literal))
			else:
				condition = 'lang_matches (&lang_str[1], "%s")' % literal
		print ('    if (' + condition, end='')
		for remaining_subtag in (pending_script, pending_region, lang_tag.variant):
			print_subtag_matches (remaining_subtag, True)
		print (')\n    {')
		write ('      /* %s */' % bcp_47.get_name (lang_tag))
		print ()
		if len (ot_tags) > 1:
			print ('      unsigned int i;\n      hb_tag_t possible_tags[] = {')
			for ot_tag_choice in ot_tags:
				write ('\t%s,  /* %s */' % (hb_tag (ot_tag_choice), ot.names[ot_tag_choice]))
				print ()
			print ('      };')
			# Copy at most ``*count`` tags and report how many were copied.
			print ('      for (i = 0; i < %s && i < *count; i++)' % len (ot_tags))
			print ('\ttags[i] = possible_tags[i];\n      *count = i;')
		else:
			write ('      tags[0] = %s;  /* %s */' % (hb_tag (ot_tags[0]), ot.names[ot_tags[0]]))
			print ()
			print ('      *count = 1;')
		print ('      return true;\n    }')
	print ('    break;')

print ('  }\n  return false;\n}')
# Emit a blank separator line, then the documentation comment and the
# opening of the generated C function ``hb_ot_ambiguous_tag_to_language``,
# up to and including the opening brace of its ``switch (tag)``.
print ('\n'.join ((
	'',
	'/**',
	' * hb_ot_ambiguous_tag_to_language',
	' * @tag: A language tag.',
	' *',
	' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
	' * many language tags) and the best tag is not the alphabetically first, or if',
	' * the best tag consists of multiple subtags, or if the best tag does not appear',
	' * in #ot_languages.',
	' *',
	' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
	' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
	' **/',
	'static hb_language_t',
	'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
	'{',
	'  switch (tag)',
	'  {',
)))
1117
def verify_disambiguation_dict ():
	"""Check ``disambiguation`` against the tag tables and normalize it.

	``disambiguation`` maps an ambiguous OpenType language system tag to
	the single BCP 47 tag it should convert back to. For every OpenType
	tag in ``ot.to_bcp_47`` this function works out the "primary" BCP 47
	tags (non-grandfathered tags whose first OpenType tag is that tag)
	and then:

	* with no primary tag, requires that no disambiguation is listed;
	* with exactly one, requires that no disambiguation is listed and
	  records that tag itself if it contains a hyphen or is not the
	  alphabetically first candidate;
	* with several, tries in turn to single out one tag (a lone
	  non-retired uninherited tag, a lone macrolanguage, a lone
	  collection, a lone non-retired tag); failing that, requires a
	  valid manual entry. The resulting entry is then dropped again if
	  it is hyphen-free and equal to the alphabetically first tag that
	  ``same_tag`` reports as different.

	Finally every key of ``disambiguation`` must be a known OpenType tag.

	Raises:
		AssertionError: Verification failed.
	"""
	def is_retired (bcp_47_tag):
		# A missing scope counts as "not retired".
		return 'retired code' in bcp_47.scopes.get (bcp_47_tag, '')

	def having_scope (candidates, scope):
		return [t for t in candidates if bcp_47.scopes.get (t) == scope]

	for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
		primary_tags = []
		if ot_tag != DEFAULT_LANGUAGE_SYSTEM:
			primary_tags = [
				t for t in bcp_47_tags
				if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag
			]

		if not primary_tags:
			expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
			continue

		if len (primary_tags) == 1:
			expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
			sole_tag = primary_tags[0]
			if '-' in sole_tag:
				disambiguation[ot_tag] = sole_tag
			else:
				# Only record the tag when it differs from the
				# alphabetically first candidate.
				first_tag = sorted (
					t for t in bcp_47_tags
					if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t)
				)[0]
				if sole_tag != first_tag:
					disambiguation[ot_tag] = sole_tag
			continue

		# Several primary tags: narrow them down step by step, stopping at
		# the first step that leaves exactly one candidate.
		original_languages = [
			t for t in primary_tags
			if t in ot.from_bcp_47_uninherited and not is_retired (t)
		]
		if len (original_languages) == 1:
			candidates = original_languages
		else:
			candidates = having_scope (primary_tags, ' [macrolanguage]')
		if len (candidates) != 1:
			candidates = having_scope (primary_tags, ' [collection]')
		if len (candidates) != 1:
			candidates = [t for t in primary_tags if not is_retired (t)]

		if len (candidates) != 1:
			# No automatic winner: a valid manual entry is mandatory.
			expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (candidates)))
			expect (disambiguation[ot_tag] in bcp_47_tags,
					'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
		elif ot_tag not in disambiguation:
			disambiguation[ot_tag] = candidates[0]

		different_bcp_47_tags = sorted (
			t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
		if different_bcp_47_tags:
			chosen = disambiguation[ot_tag]
			if chosen == different_bcp_47_tags[0] and '-' not in chosen:
				del disambiguation[ot_tag]

	for listed_ot_tag in disambiguation:
		expect (listed_ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % listed_ot_tag)
1174
# Validate and normalize ``disambiguation``, then emit one ``case`` per
# remaining entry in sorted key order, followed by the ``default`` branch,
# the end of the generated function, the include guard and the end marker.
verify_disambiguation_dict ()
for ambiguous_ot_tag in sorted (disambiguation):
	chosen_bcp_47_tag = disambiguation[ambiguous_ot_tag]
	write ('  case %s:  /* %s */' % (hb_tag (ambiguous_ot_tag), ot.names[ambiguous_ot_tag]))
	print ()
	chosen_name = bcp_47.get_name (LanguageTag (chosen_bcp_47_tag))
	write ('    return hb_language_from_string ("%s", -1);  /* %s */' % (chosen_bcp_47_tag, chosen_name))
	print ()

print ('\n'.join ((
	'  default:',
	'    return HB_LANGUAGE_INVALID;',
	'  }',
	'}',
	'',
	'#endif /* HB_OT_TAG_TABLE_HH */',
	'',
	'/* == End of generated table == */',
)))
1191
1192