1#!/usr/bin/env python3
2# flake8: noqa
3
4import io
5import sys
6
# The script needs exactly four data files on the command line; anything
# else gets the usage text on stderr and a failing exit status.
if len (sys.argv[1:]) != 4:
	sys.stderr.write ("""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt

Input file, as of Unicode 12:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt""" + "\n")
	raise SystemExit (1)
16
# Unicode blocks excluded from the generated table; code points whose block
# is listed here are dropped when the per-file data is merged.
BLACKLISTED_BLOCKS = ["Thai", "Lao"]

# Open the inputs one at a time so that everything opened so far is still
# closed if a later path cannot be opened.
files = []
try:
	for path in sys.argv[1:]:
		files.append (io.open (path, encoding='utf-8'))

	# Remember the first two lines of every file except UnicodeData.txt
	# (index 2) so they can be quoted in the generated output.
	headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
	headers.append (["UnicodeData.txt does not have a header."])

	# data[i] maps code point -> property value read from file i;
	# values[i] counts how many code points carry each value in file i.
	data = [{} for f in files]
	values = [{} for f in files]
	for i, f in enumerate (files):
		for line in f:

			# Drop a trailing '#' comment, then split the ';'-separated fields.
			fields = [x.strip () for x in line.partition ('#')[0].split (';')]
			if len (fields) == 1:
				# Blank or comment-only line.
				continue

			# The first field is one code point or a "start..end" range, in hex.
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			end = int (uu[1], 16) if len (uu) > 1 else start

			# UnicodeData.txt (index 2) keeps the value of interest in its
			# third field; the other files keep it in the second.
			t = fields[2 if i == 2 else 1]

			for u in range (start, end + 1):
				data[i][u] = t
			values[i][t] = values[i].get (t, 0) + end - start + 1
finally:
	# Parsing is finished (or failed): release the file handles either way.
	for f in files:
		f.close ()
49
# Fallback value for each column of the merged data, in input-file order:
# IndicSyllabicCategory, IndicPositionalCategory, UnicodeData, Blocks.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in (0x034F, 0x1B61, 0x1B63, 0x1B64, 0x1B65, 0x1B66, 0x1B67, 0x1B69, 0x1B6A, 0x2060):
	data[0][u] = defaults[0]
# Code points forced to Consonant_Placeholder:
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685   (U+1B5B, U+1B5C, U+1B5F, U+1B62, U+1B68)
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035 (U+11C44, U+11C45)
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399   (U+111C8)
for u in (0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68, 0x11C44, 0x11C45, 0x111C8):
	data[0][u] = 'Consonant_Placeholder'
# Variation selectors get the default syllabic category.
for u in range (0xFE00, 0xFE0F + 1):
	data[0][u] = defaults[0]

# Merge data into one dict:
for column, fallback in enumerate (defaults):
	values[column][fallback] = values[column].get (fallback, 0) + 1
merged = {}
for column, table in enumerate (data):
	# Only the first two files may introduce a code point; the remaining
	# ones merely fill in columns of rows that already exist.
	may_create = column < 2
	for codepoint, prop in table.items ():
		row = merged.get (codepoint)
		if row is None:
			if not may_create:
				continue
			row = merged[codepoint] = list (defaults)
		row[column] = prop
# Column 3 is the block name; drop everything in a blacklisted block.
data = {codepoint: row for codepoint, row in merged.items () if row[3] not in BLACKLISTED_BLOCKS}
del merged
num = len (data)
92
93
# Names of every property value the classification code refers to, grouped
# by the Unicode property they belong to.
property_names = []

# General_Category
property_names += [
	'Cc', 'Cf', 'Cn', 'Co', 'Cs',
	'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
	'Mc', 'Me', 'Mn',
	'Nd', 'Nl', 'No',
	'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
	'Sc', 'Sk', 'Sm', 'So',
	'Zl', 'Zp', 'Zs',
]

# Indic_Syllabic_Category
property_names += [
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
]

# Indic_Positional_Category
property_names += [
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
]
153
class PropertyValue(object):
	"""A named property value that compares equal to its own name.

	Instances are equal to a plain string holding the same name and to any
	object whose ``name`` attribute matches, and they hash like that string.
	This lets the raw strings read from the data files be tested against
	the module-level constants with ``==`` and ``in``.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __repr__(self):
		return "%s(%r)" % (type(self).__name__, self.name)
	def __eq__(self, other):
		if isinstance(other, str):
			return self.name == other
		try:
			return self.name == other.name
		except AttributeError:
			# Unrelated type (None, int, ...): let Python fall back to its
			# default comparison instead of raising.
			return NotImplemented
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must agree with __eq__: equal to the name, so hash like the name.
		return hash(str(self))
165
# Build one PropertyValue per known name and publish each of them as a
# module-level constant, so the code below can write e.g. `UGC == Lo`.
property_values = {}

for prop_name in property_names:
	# A name may be listed only once and must not clobber an existing global.
	assert prop_name not in property_values
	assert prop_name not in globals()
	property_values[prop_name] = PropertyValue(prop_name)
globals().update(property_values)
174
175
def is_BASE(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'B' (base) category.

	U is the code point, UISC its Indic_Syllabic_Category and UGC its
	General_Category.
	"""
	if UISC in (Number, Consonant, Consonant_Head_Letter,
			#SPEC-DRAFT Consonant_Placeholder,
			Tone_Letter,
			Vowel_Independent #SPEC-DRAFT
			):
		return True
	# These categories only count as bases when the character is a letter (Lo).
	letter_only = (Avagraha, Bindu, Consonant_Final, Consonant_Medial,
			Consonant_Subjoined, Vowel, Vowel_Dependent)
	return UGC == Lo and UISC in letter_only
def is_BASE_IND(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'IND' category."""
	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	if UISC in (Consonant_Dead, Modifying_Letter):
		return True
	# Punctuation (Po) qualifies too, except for these code points.
	# SPEC-DRAFT-OUTDATED! U == 0x002D is no longer accepted here.
	excluded_punctuation = (0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022,
			0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45)
	return UGC == Po and U not in excluded_punctuation
def is_BASE_NUM(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'N' category."""
	return Brahmi_Joining_Number == UISC
def is_BASE_OTHER(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'GB' category."""
	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	# Consonant_Placeholder is accepted as well. #SPEC-DRAFT
	return (UISC == Consonant_Placeholder or
		U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
	"""Return True only for U+034F, the single member of the USE 'CGJ' category."""
	return 0x034F == U
def is_CONS_FINAL(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'F' category."""
	if UISC == Consonant_Succeeding_Repha:
		return True
	# A final consonant that is a letter (Lo) is not accepted here.
	return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'FM' category."""
	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	return Syllable_Modifier == UISC
def is_CONS_MED(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'M' category."""
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	if UISC == Consonant_Initial_Postfixed:
		return True
	# A medial consonant that is a letter (Lo) is not accepted here.
	return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'CM' category."""
	consonant_modifiers = (Nukta, Gemination_Mark, Consonant_Killer)
	return UISC in consonant_modifiers
def is_CONS_SUB(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'SUB' category."""
	#SPEC-DRAFT return UISC == Consonant_Subjoined
	if UGC == Lo:
		# Subjoined consonants that are letters are not accepted here.
		return False
	return UISC == Consonant_Subjoined
def is_CONS_WITH_STACKER(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'CS' category."""
	return Consonant_With_Stacker == UISC
def is_HALANT(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'H' category.

	Viramas and invisible stackers qualify unless the character is claimed
	by the 'HVM' or 'Sk' categories instead.
	"""
	if UISC not in (Virama, Invisible_Stacker):
		return False
	if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
		return False
	return not is_SAKOT(U, UISC, UGC)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
	"""Return True for the two code points in the USE 'HVM' category."""
	# https://github.com/harfbuzz/harfbuzz/issues/1102
	# https://github.com/harfbuzz/harfbuzz/issues/1379
	return U == 0x11046 or U == 0x1134D
def is_HALANT_NUM(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'HN' category."""
	return Number_Joiner == UISC
def is_ZWNJ(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'ZWNJ' category."""
	return Non_Joiner == UISC
def is_ZWJ(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'ZWJ' category."""
	return Joiner == UISC
def is_Word_Joiner(U, UISC, UGC):
	"""Return True only for U+2060, the single member of the USE 'WJ' category."""
	return 0x2060 == U
def is_OTHER(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'O' category.

	That is every character whose syllabic category is Other and which is
	not claimed by the 'S', 'SM', 'CGJ', 'WJ' or 'VS' categories.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if UISC != Other:
		return False
	if is_SYM(U, UISC, UGC):
		return False
	if is_SYM_MOD(U, UISC, UGC):
		return False
	if is_CGJ(U, UISC, UGC):
		return False
	if is_Word_Joiner(U, UISC, UGC):
		return False
	return not is_VARIATION_SELECTOR(U, UISC, UGC)
def is_Reserved(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'Rsv' category (General_Category Cn)."""
	return 'Cn' == UGC
def is_REPHA(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'R' category."""
	return UISC == Consonant_Preceding_Repha or UISC == Consonant_Prefixed
def is_SAKOT(U, UISC, UGC):
	"""Return True only for U+1A60, the single member of the USE 'Sk' category."""
	return 0x1A60 == U
def is_SYM(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'S' category."""
	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
	# U+25CC is never a symbol here (#SPEC-DRAFT), and neither are
	# U+1B62 and U+1B68.
	return U not in (0x25CC, 0x1B62, 0x1B68) and UGC in (So, Sc)
def is_SYM_MOD(U, UISC, UGC):
	"""Return True for U+1B6B..U+1B73, the members of the USE 'SM' category."""
	return U in (0x1B6B, 0x1B6C, 0x1B6D,
		     0x1B6E, 0x1B6F, 0x1B70,
		     0x1B71, 0x1B72, 0x1B73)
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Return True for U+FE00..U+FE0F, the members of the USE 'VS' category."""
	return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'V' category."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC == Pure_Killer:
		return True
	# Non-letter vowels qualify, except U+AA29 which is_VOWEL_MOD claims.
	return UGC != Lo and UISC in (Vowel, Vowel_Dependent) and U != 0xAA29
def is_VOWEL_MOD(U, UISC, UGC):
	"""Return True if the character belongs to the USE 'VM' category."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	# Non-letter bindus qualify, and so does a non-letter U+AA29.
	return UGC != Lo and (UISC == Bindu or U == 0xAA29)
262
# USE category tag -> predicate taking (U, UISC, UGC) that decides whether a
# character belongs to that category.  map_to_use() requires exactly one
# predicate to accept each character.
use_mapping = dict ([
	('B',	is_BASE),
	('IND',	is_BASE_IND),
	('N',	is_BASE_NUM),
	('GB',	is_BASE_OTHER),
	('CGJ',	is_CGJ),
	('F',	is_CONS_FINAL),
	('FM',	is_CONS_FINAL_MOD),
	('M',	is_CONS_MED),
	('CM',	is_CONS_MOD),
	('SUB',	is_CONS_SUB),
	('CS',	is_CONS_WITH_STACKER),
	('H',	is_HALANT),
	('HVM',	is_HALANT_OR_VOWEL_MODIFIER),
	('HN',	is_HALANT_NUM),
	('ZWNJ',	is_ZWNJ),
	('ZWJ',	is_ZWJ),
	('WJ',	is_Word_Joiner),
	('O',	is_OTHER),
	('Rsv',	is_Reserved),
	('R',	is_REPHA),
	('S',	is_SYM),
	('Sk',	is_SAKOT),
	('SM',	is_SYM_MOD),
	('VS',	is_VARIATION_SELECTOR),
	('V',	is_VOWEL),
	('VM',	is_VOWEL_MOD),
])
291
# For USE categories that are split by position: suffix -> list of
# Indic_Positional_Category values that select that suffix.  Categories
# mapped to None may carry a position but get no suffix.
use_positions = {}

use_positions['F'] = {
	'Abv': [Top],
	'Blw': [Bottom],
	'Pst': [Right],
}
use_positions['M'] = {
	'Abv': [Top],
	'Blw': [Bottom, Bottom_And_Left],
	'Pst': [Right],
	'Pre': [Left],
}
use_positions['CM'] = {
	'Abv': [Top],
	'Blw': [Bottom],
}
use_positions['V'] = {
	'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
	'Blw': [Bottom, Overstruck, Bottom_And_Right],
	'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	'Pre': [Left],
}
use_positions['VM'] = {
	'Abv': [Top],
	'Blw': [Bottom, Overstruck],
	'Pst': [Right],
	'Pre': [Left],
}
use_positions['SM'] = {
	'Abv': [Top],
	'Blw': [Bottom],
}
use_positions['H'] = None
use_positions['HVM'] = None
use_positions['B'] = None
use_positions['FM'] = {
	'Abv': [Top],
	'Blw': [Bottom],
	'Pst': [Not_Applicable],
}
use_positions['SUB'] = None
334
def map_to_use(data):
	"""Classify every code point in *data* into its USE category.

	data maps a code point to a (UISC, UIPC, UGC, UBlock) sequence.  The
	result maps each code point to a (USE, UBlock) tuple, where USE is the
	tag of the single use_mapping predicate accepting the character,
	followed by a positional suffix when use_positions splits that tag.

	Raises AssertionError when a character does not match exactly one
	category, when it has a position but its category takes none, or when
	its position does not select exactly one suffix.

	The order of the overrides below is significant: the Tibetan and Chakma
	position fixes call is_VOWEL with the syllabic category as it stands at
	that point, before the later syllabic overrides are applied.
	"""
	out = {}
	classifiers = use_mapping.items()
	for U, properties in data.items():
		UISC, UIPC, UGC, UBlock = properties

		# Resolve Indic_Syllabic_Category

		if 0x1CE2 <= U <= 0x1CE8:
			# TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
			UISC = Cantillation_Mark
		elif 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
			# Tibetan:
			# TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
			UISC = Vowel_Dependent
		elif 0x0F86 <= U <= 0x0F87:
			UISC = Tone_Mark

		# Overrides to allow NFC order matching syllable
		# https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Tibetan' and UIPC == Top and is_VOWEL (U, UISC, UGC):
			UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
			# Swap top and bottom.
			if UIPC == Top:
				UIPC = Bottom
			elif UIPC == Bottom:
				UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		if 0x1BF2 <= U <= 0x1BF3:
			UISC, UIPC = Nukta, Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
		if U == 0x11134:
			UISC = Gemination_Mark

		# Exactly one category predicate must accept the character.
		matched = [tag for tag, accepts in classifiers if accepts (U, UISC, UGC)]
		assert len(matched) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matched)
		USE = matched[0]

		# Resolve Indic_Positional_Category

		if U in (0x953, 0x954):
			# TODO: These should die, but have UIPC in Unicode 12.0
			UIPC = Not_Applicable
		elif U == 0x103C:
			# TODO: In USE's override list but not in Unicode 12.0
			UIPC = Left
		elif U == 0x1C29:
			# TODO: https://github.com/harfbuzz/harfbuzz/pull/2012
			UIPC = Left
		elif 0xA926 <= U <= 0xA92A:
			# TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
			UIPC = Top
		elif U in (0x11302, 0x11303, 0x114C1):
			# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
			#  and https://github.com/harfbuzz/harfbuzz/issues/1631
			UIPC = Top
		elif U == 0x1171E:
			UIPC = Left
		elif 0x1CF8 <= U <= 0x1CF9:
			UIPC = Top

		# A character with a real position must be in a category that
		# use_positions knows about.
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		# Append the positional suffix for categories that are split by position.
		positions = use_positions.get(USE)
		if positions:
			suffixes = [suffix for suffix, members in positions.items() if members and UIPC in members]
			assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
			USE += suffixes[0]

		out[U] = (USE, UBlock)
	return out
407
# From here on `data` maps code point -> (USE, UBlock), and `defaults` is the
# pair used for code points that have no entry.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Opening banner of the generated file, up to the quoted input headers.
for banner_line in (
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
):
	print (banner_line)
for file_header in headers:
	for header_line in file_header:
		print (" * %s" % (header_line.strip()))
# Close the banner comment and emit the C preamble; "" yields a blank line.
for preamble_line in (
	" */",
	"",
	'#include "hb.hh"',
	"",
	'#ifndef HB_NO_OT_SHAPE',
	"",
	'#include "hb-ot-shape-complex-use.hh"',
	"",
):
	print (preamble_line)
430
# Running totals maintained by print_block(): number of table slots emitted,
# number of those backed by real data, and the last named block printed.
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
	"""Print the table entries for code points start..end (inclusive).

	block is the block name to announce in a comment, or a false value for
	filler between blocks.  start and end + 1 must be multiples of 8.
	data maps code point -> (USE, UBlock); code points missing from it are
	printed with the module-level `defaults`.  Updates the global counters
	`total` and `used`, and `last_block` when a block name is given.
	"""
	global total, used, last_block
	if block and block != last_block:
		# Two blank lines, then the block name as a comment.
		print ("\n\n  /* %s */" % block)
		column = start % 16
		if column:
			# The block starts mid-row: pad up to its first column.
			print (' ' * (20 + (column * 6)), end='')
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	populated = 0
	for u in range (start, end+1):
		if u % 16 == 0:
			# Start a new row of 16 entries, labelled with its first code point.
			print ("\n  /* %04X */" % u, end='')
		if u in data:
			populated += 1
			entry = data[u]
		else:
			entry = defaults
		print ("%6s," % entry[0], end='')

	total += end - start + 1
	used += populated
	if block:
		last_block = block
458
# All classified code points in ascending order.
uu = sorted (data)

# State for the table emission that follows.
last = -100000
num = 0
offset = 0
starts, ends = [], []

# Short aliases for the USE_* constants, so table rows stay narrow.
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for tag in sorted (use_mapping):
	if use_positions.get (tag):
		# Split by position: its suffixed variants are defined below instead.
		continue
	print ("#define %s	USE_%s	/* %s */" % (tag, tag, use_mapping[tag].__name__[3:]))
for base in sorted (use_positions):
	for suf in (use_positions[base] or ()):
		tag = base + suf
		print ("#define %s	USE_%s" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
for u in uu:
	if u <= last:
		# Already covered by a previously printed chunk.
		continue
	block = data[u][1]

	# Extend the chunk over consecutive code points of the same block, then
	# round both ends out to multiples of 8.  Membership is tested against
	# the `data` dict (same keys as `uu`) so each probe is O(1) rather than
	# a linear scan of the sorted list.
	start = u//8*8
	end = start+1
	while end in data and block == data[end][1]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: pad it with default entries and stay in the same range.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			# Large gap: close the previous range (if any) and open a new
			# one, recording where it begins inside the flat table.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define use_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final range.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
# Percentage of emitted slots backed by real data.
occupancy = used * 100. / total
# The generated lookup function switches on the code point shifted by this many bits.
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
512print ()
# Emit the C lookup function: switch on the code point's page, then test
# the ranges that touch that page.
for c_line in (
	"USE_TABLE_ELEMENT_TYPE",
	"hb_use_get_category (hb_codepoint_t u)",
	"{",
	"  switch (u >> %d)" % page_bits,
	"  {",
):
	print (c_line)
ranges = list (zip (starts, ends))
pages = {bound >> page_bits for bound in starts + ends}
for p in sorted (pages):
	print ("    case 0x%0Xu:" % p)
	for first, limit in ranges:
		# Only ranges that begin or end on this page are tested here.
		if p != first >> page_bits and p != limit >> page_bits:
			continue
		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (first, limit - 1, first, "use_offset_0x%04xu" % first))
	print ("      break;")
	print ("")
for c_line in (
	"    default:",
	"      break;",
	"  }",
	"  return USE_O;",
	"}",
	"",
):
	print (c_line)

# Remove the short aliases again, mirroring the #define list.
for tag in sorted (use_mapping):
	if use_positions.get (tag):
		continue
	print ("#undef %s" % tag)
for base in sorted (use_positions):
	for suf in (use_positions[base] or ()):
		print ("#undef %s" % (base + suf))
for c_line in (
	"",
	"",
	'#endif',
	"/* == End of generated table == */",
):
	print (c_line)

# Maintain at least 50% occupancy in the table
if occupancy < 50:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
549