1#!/usr/bin/env python3
2# flake8: noqa: F821
3
4"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
5
6Input files:
7* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
8* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
9* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
10* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
11* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
12* ms-use/IndicSyllabicCategory-Additional.txt
13* ms-use/IndicPositionalCategory-Additional.txt
14"""
15
16import sys
17
# Exactly seven input files are expected; otherwise show the usage text and stop.
if len (sys.argv) - 1 != 7:
	sys.exit (__doc__)

# Characters whose block is listed here are removed from the merged table.
BLACKLISTED_BLOCKS = ['Samaritan', 'Thai', 'Lao']

files = []
for path in sys.argv[1:]:
	files.append (open (path, encoding='utf-8'))

# The first two lines of every input except UnicodeData.txt (index 2) are
# kept as that file's header.
headers = []
for j, f in enumerate (files):
	if j == 2:
		continue
	headers.append ([f.readline (), f.readline ()])
# For the two *-Additional.txt inputs (indices 5 and 6) the header continues
# up to the first blank line.  Index 2 was skipped above, hence `j - 1`.
for j in (5, 6):
	extra_header = headers[j - 1]
	for line in files[j]:
		line = line.rstrip ()
		if not line:
			break
		extra_header.append (line)
headers.append (["UnicodeData.txt does not have a header."])
37
# data[k] maps code point -> value of the k-th property; values[k] counts how
# many code points were assigned each value.  Both lists have one slot per
# input file, but the two *-Additional.txt inputs (indices 5 and 6) are
# folded into slots 0 and 1 below, so the last two slots stay empty.
data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	# Becomes True once a line mentioning USE_Syllabic_Category was seen in
	# the current file.
	extended = False

	for line in f:

		# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/522
		# After that marker, commented-out records ("# <range> ; <value>")
		# are parsed as live data.  The ';' check uses `in` because
		# str.find returns -1 (which is truthy) when nothing is found and
		# therefore cannot serve as a boolean.
		if extended and line.startswith ('# ') and ';' in line:
			line = line[2:]
		elif 'USE_Syllabic_Category' in line:
			extended = True

		# Strip a trailing comment, if any.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		# Lines without a ';' separator carry no record.
		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		# First field is either "XXXX" or "XXXX..YYYY" (hexadecimal, inclusive).
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		# UnicodeData.txt (2) and ArabicShaping.txt (3) keep the value in
		# the third field; all other inputs keep it in the second.
		t = fields[1 if i not in [2, 3] else 2]

		if i == 3:
			t = 'jt_' + t
		elif i == 5 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 6 and t == 'NA':
			t = 'Not_Applicable'

		# The *-Additional.txt inputs override the slots of the matching
		# Unicode files (5 -> 0, 6 -> 1).
		i0 = i if i < 5 else i - 5
		for u in range (start, end + 1):
			data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1

	# The file has been read to the end and is not used again.
	f.close ()
80
# Fallback value for each of the five properties, in input-file order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'jt_X', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
# Each pair is an inclusive code point range that receives the default
# syllabic category.
for first, last in (
	(0x0640, 0x0640),
	(0x1B61, 0x1B61),
	(0x1B63, 0x1B67),
	(0x1B69, 0x1B6A),
	(0x2060, 0x2060),
	(0x07CA, 0x07EA),
	(0x07FA, 0x07FA),
	(0x0840, 0x0858),
	(0x1887, 0x18A8),
	(0x18AA, 0x18AA),
	(0xA840, 0xA872),
	(0x10B80, 0x10B91),
	(0x10BA9, 0x10BAE),
	(0x10FB0, 0x10FB0),
	(0x10FB2, 0x10FB6),
	(0x10FB8, 0x10FBF),
	(0x10FC1, 0x10FC4),
	(0x10FC9, 0x10FCB),
):
	for u in range (first, last + 1):
		data[0][u] = defaults[0]

# Code points forced to the Consonant_Placeholder syllabic category.
for u in (
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
	0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68,
	# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
	0x11C44, 0x11C45,
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
	0x111C8,
):
	data[0][u] = 'Consonant_Placeholder'
128
# Merge data into one dict:
# every default value is counted once more in its property's tally.
for index, default in enumerate (defaults):
	values[index][default] = values[index].get (default, 0) + 1

# merged[u] is a list holding one value per property, pre-filled with the
# defaults.  Only the first two inputs may introduce a code point; values
# from later inputs are recorded only for code points already present.
merged = {}
for index, table in enumerate (data):
	for u, value in table.items ():
		row = merged.get (u)
		if row is None:
			if index >= 2:
				continue
			row = merged[u] = list (defaults)
		row[index] = value

# Drop every code point whose block (column 4) is blacklisted.
data = {u: row for u, row in merged.items () if row[4] not in BLACKLISTED_BLOCKS}
del merged
143
144
# Names of all property values the script refers to, grouped by property.
# The groups are concatenated in this order into one flat list.
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll',
	'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 'Me',
	'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd',
	'Pe', 'Pf', 'Pi', 'Po', 'Ps', 'Sc',
	'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
] + [
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
] + [
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
] + [
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]
217
class PropertyValue(object):
	"""A named property value that compares and hashes like its name.

	An instance is equal to the plain string holding its name and to any
	other object whose `name` attribute equals its own, and it hashes like
	that string, so instances and strings can be mixed freely in `==`, `in`
	and dict lookups.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __repr__(self):
		return "PropertyValue(%r)" % (self.name,)
	def __eq__(self, other):
		if isinstance(other, str):
			return self.name == other
		try:
			return self.name == other.name
		except AttributeError:
			# Unrelated operand (None, int, ...): let Python fall back to its
			# default comparison instead of raising.
			return NotImplemented
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must agree with __eq__: equal to the hash of the name string.
		return hash(str(self))
229
# Maps each property value name to its PropertyValue object.
property_values = {}
module_namespace = globals()

for name in property_names:
	value = PropertyValue(name)
	# A name must be neither a duplicate nor clash with an existing
	# module-level identifier (PropertyValue hashes and compares like its name).
	clashes = value in property_values or value in module_namespace
	assert not clashes
	property_values[name] = value
# Publish every value as a module-level name (e.g. `Bindu`, `jt_C`).
module_namespace.update(property_values)
238
239
def is_BASE(U, UISC, UGC, AJT):
	"""Predicate for USE category B.

	U is the code point, UISC its syllabic category, UGC its general
	category and AJT its joining type.
	"""
	if UISC in (Number, Consonant, Consonant_Head_Letter, Tone_Letter, Vowel_Independent):
		return True
	# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
	if AJT in (jt_C, jt_D, jt_L, jt_R) and UISC != Joiner:
		return True
	# Letters (Lo) in categories that otherwise map to other USE classes.
	return UGC == Lo and UISC in (
		Avagraha, Bindu, Consonant_Final, Consonant_Medial,
		Consonant_Subjoined, Vowel, Vowel_Dependent)
def is_BASE_NUM(U, UISC, UGC, AJT):
	"""Predicate for USE category N: syllabic category Brahmi_Joining_Number."""
	is_joining_number = UISC == Brahmi_Joining_Number
	return is_joining_number
def is_BASE_OTHER(U, UISC, UGC, AJT):
	"""Predicate for USE category GB: a Consonant_Placeholder or one of a
	fixed set of code points."""
	return (UISC == Consonant_Placeholder or
		U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CONS_FINAL(U, UISC, UGC, AJT):
	"""Predicate for USE category F: a Consonant_Final whose general category
	is not Lo, or any Consonant_Succeeding_Repha."""
	if UISC == Consonant_Final and UGC != Lo:
		return True
	return UISC == Consonant_Succeeding_Repha
def is_CONS_FINAL_MOD(U, UISC, UGC, AJT):
	"""Predicate for USE category FM: syllabic category Syllable_Modifier."""
	return UISC in (Syllable_Modifier,)
def is_CONS_MED(U, UISC, UGC, AJT):
	"""Predicate for USE category M: a Consonant_Medial whose general category
	is not Lo, or any Consonant_Initial_Postfixed."""
	if UISC == Consonant_Medial and UGC != Lo:
		return True
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return UISC == Consonant_Initial_Postfixed
def is_CONS_MOD(U, UISC, UGC, AJT):
	"""Predicate for USE category CM: Nukta, Gemination_Mark or
	Consonant_Killer, unless the code point is a symbol modifier."""
	if UISC not in (Nukta, Gemination_Mark, Consonant_Killer):
		return False
	return not is_SYM_MOD(U, UISC, UGC, AJT)
def is_CONS_SUB(U, UISC, UGC, AJT):
	"""Predicate for USE category SUB: a Consonant_Subjoined whose general
	category is not Lo."""
	if UISC == Consonant_Subjoined:
		return UGC != Lo
	return False
def is_CONS_WITH_STACKER(U, UISC, UGC, AJT):
	"""Predicate for USE category CS: syllabic category Consonant_With_Stacker."""
	has_stacker = UISC == Consonant_With_Stacker
	return has_stacker
def is_HALANT(U, UISC, UGC, AJT):
	"""Predicate for USE category H: a Virama or Invisible_Stacker that is
	neither classified as HVM nor as Sk."""
	if UISC not in (Virama, Invisible_Stacker):
		return False
	if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC, AJT):
		return False
	return not is_SAKOT(U, UISC, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC, AJT):
	"""Predicate for USE category HVM: true only for U+11046 and U+1134D."""
	# https://github.com/harfbuzz/harfbuzz/issues/1102
	# https://github.com/harfbuzz/harfbuzz/issues/1379
	return U == 0x11046 or U == 0x1134D
def is_HALANT_NUM(U, UISC, UGC, AJT):
	"""Predicate for USE category HN: syllabic category Number_Joiner."""
	return UISC in (Number_Joiner,)
def is_HIEROGLYPH(U, UISC, UGC, AJT):
	"""Predicate for USE category G: syllabic category Hieroglyph."""
	is_hieroglyph = UISC == Hieroglyph
	return is_hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UGC, AJT):
	"""Predicate for USE category J: syllabic category Hieroglyph_Joiner."""
	return UISC in (Hieroglyph_Joiner,)
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UGC, AJT):
	"""Predicate for USE category SB: syllabic category Hieroglyph_Segment_Begin."""
	opens_segment = UISC == Hieroglyph_Segment_Begin
	return opens_segment
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UGC, AJT):
	"""Predicate for USE category SE: syllabic category Hieroglyph_Segment_End."""
	closes_segment = UISC == Hieroglyph_Segment_End
	return closes_segment
def is_ZWNJ(U, UISC, UGC, AJT):
	"""Predicate for USE category ZWNJ: syllabic category Non_Joiner."""
	return UISC in (Non_Joiner,)
def is_OTHER(U, UISC, UGC, AJT):
	"""Predicate for USE category O.

	True when the general category is Cn or Po, or the syllabic category is
	Consonant_Dead, Joiner, Modifying_Letter or Other, provided none of the
	B, GB, S or SM predicates claims the code point.
	"""
	candidate = (UGC in (Cn, Po) or
		UISC in (Consonant_Dead, Joiner, Modifying_Letter, Other))
	if not candidate:
		return False
	# Checked in this order; evaluation stops at the first predicate that matches.
	claimed = any(predicate(U, UISC, UGC, AJT)
		for predicate in (is_BASE, is_BASE_OTHER, is_SYM, is_SYM_MOD))
	return not claimed
def is_REPHA(U, UISC, UGC, AJT):
	"""Predicate for USE category R: Consonant_Preceding_Repha or
	Consonant_Prefixed."""
	return UISC == Consonant_Preceding_Repha or UISC == Consonant_Prefixed
def is_SAKOT(U, UISC, UGC, AJT):
	"""Predicate for USE category Sk: true only for U+1A60."""
	return U in (0x1A60,)
def is_SYM(U, UISC, UGC, AJT):
	"""Predicate for USE category S: general category So or Sc, apart from a
	few explicitly excluded code points."""
	if U in (0x25CC, 0x1E14F):
		return False
	if UGC not in (So, Sc):
		return False
	return U not in (0x0F01, 0x1B62, 0x1B68)
def is_SYM_MOD(U, UISC, UGC, AJT):
	"""Predicate for USE category SM: code points U+1B6B through U+1B73."""
	return U in range(0x1B6B, 0x1B73 + 1)
def is_VOWEL(U, UISC, UGC, AJT):
	"""Predicate for USE category V: a Pure_Killer, or a Vowel/Vowel_Dependent
	whose general category is not Lo (U+AA29 excepted)."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC == Pure_Killer:
		return True
	if UGC != Lo and UISC in (Vowel, Vowel_Dependent):
		return U != 0xAA29
	return False
def is_VOWEL_MOD(U, UISC, UGC, AJT):
	"""Predicate for USE category VM: tone, cantillation, register-shifter and
	visarga marks, plus Bindu and U+AA29 when the general category is not Lo."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	if UGC != Lo:
		return UISC == Bindu or U == 0xAA29
	return False
314
# CGJ, VS, WJ, and ZWJ are handled in find_syllables
# Maps each USE category tag to the predicate deciding membership; map_to_use
# requires exactly one predicate to accept any given code point.
use_mapping = dict ((
	('B',	is_BASE),
	('N',	is_BASE_NUM),
	('GB',	is_BASE_OTHER),
	('F',	is_CONS_FINAL),
	('FM',	is_CONS_FINAL_MOD),
	('M',	is_CONS_MED),
	('CM',	is_CONS_MOD),
	('SUB',	is_CONS_SUB),
	('CS',	is_CONS_WITH_STACKER),
	('H',	is_HALANT),
	('HVM',	is_HALANT_OR_VOWEL_MODIFIER),
	('HN',	is_HALANT_NUM),
	('G',	is_HIEROGLYPH),
	('J',	is_HIEROGLYPH_JOINER),
	('SB',	is_HIEROGLYPH_SEGMENT_BEGIN),
	('SE',	is_HIEROGLYPH_SEGMENT_END),
	('ZWNJ',	is_ZWNJ),
	('O',	is_OTHER),
	('R',	is_REPHA),
	('S',	is_SYM),
	('Sk',	is_SAKOT),
	('SM',	is_SYM_MOD),
	('V',	is_VOWEL),
	('VM',	is_VOWEL_MOD),
))
342
# For each USE category that may carry a positional category: either a
# mapping from sub-class suffix to the positional categories it covers, or
# None when the category is accepted as-is and receives no suffix.
use_positions = {
	'F': dict (Abv=[Top], Blw=[Bottom], Pst=[Right]),
	'M': dict (
		Abv=[Top],
		Blw=[Bottom, Bottom_And_Left, Bottom_And_Right],
		Pst=[Right],
		Pre=[Left, Top_And_Bottom_And_Left],
	),
	'CM': dict (Abv=[Top], Blw=[Bottom, Overstruck]),
	'V': dict (
		Abv=[Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		Blw=[Bottom, Overstruck, Bottom_And_Right],
		Pst=[Right],
		Pre=[Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	),
	'VM': dict (Abv=[Top], Blw=[Bottom, Overstruck], Pst=[Right], Pre=[Left]),
	'SM': dict (Abv=[Top], Blw=[Bottom]),
	'H': None,
	'HVM': None,
	'B': None,
	'FM': dict (Abv=[Top], Blw=[Bottom], Pst=[Not_Applicable]),
	'R': None,
	'SUB': None,
}
386
def map_to_use(data):
	"""Translate merged Unicode properties into USE categories.

	`data` maps each code point to five values: syllabic category,
	positional category, general category, joining type and block.

	Returns a dict mapping each code point to a `(USE category, block)`
	tuple.  When `use_positions` defines sub-classes for the category, the
	matching suffix (e.g. `Abv`) is appended to the category tag.

	Raises AssertionError if a code point is not accepted by exactly one
	predicate of `use_mapping`, or if its positional category cannot be
	resolved to exactly one sub-class.
	"""
	out = {}
	predicates = use_mapping.items()
	for U, properties in data.items():
		UISC, UIPC, UGC, AJT, UBlock = properties

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8:
			UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
			UISC = Vowel_Dependent

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		if 0x1BF2 <= U <= 0x1BF3:
			UISC = Nukta
			UIPC = Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# TODO: https://github.com/microsoft/font-tools/issues/1
		if U == 0xA982:
			UISC = Consonant_Succeeding_Repha

		# Exactly one predicate must claim the code point.
		matches = [tag for tag, predicate in predicates if predicate(U, UISC, UGC, AJT)]
		assert len(matches) == 1, "%s %s %s %s %s" % (hex(U), UISC, UGC, AJT, matches)
		USE = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: These should die, but have UIPC in Unicode 13.0.0
		if U in (0x953, 0x954):
			UIPC = Not_Applicable

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
		if 0xA926 <= U <= 0xA92A:
			UIPC = Top
		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in (0x11302, 0x11303, 0x114C1):
			UIPC = Top
		if 0x1CF8 <= U <= 0x1CF9:
			UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		if 0x1112A <= U <= 0x1112B:
			UIPC = Top
		if 0x11131 <= U <= 0x11132:
			UIPC = Top

		# A positional category other than Not_Applicable / Visual_Order_Left
		# is only accepted for categories listed in use_positions (U+0F7F excepted).
		position_unchecked = UIPC in (Not_Applicable, Visual_Order_Left) or U == 0x0F7F
		assert position_unchecked or USE in use_positions, (
			"%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, AJT))

		pos_mapping = use_positions.get(USE)
		if pos_mapping:
			suffixes = [suffix for suffix, positions in pos_mapping.items()
				if positions and UIPC in positions]
			assert len(suffixes) == 1, "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, AJT, suffixes)
			USE += suffixes[0]

		out[U] = (USE, UBlock)
	return out
443
# From here on `defaults` is the (USE category, block) pair used for code
# points that are missing from the mapped table.
defaults = ('O', 'No_Block')
data = map_to_use (data)

# Opening banner of the generated header.
for banner_line in (
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]),
	" *",
	" * on files with these headers:",
	" *",
):
	print (banner_line)
# Echo the header lines captured from each input file.
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
# Close the comment, then emit the include guard and the includes.
for guard_line in (
	" */",
	"",
	"#ifndef HB_OT_SHAPE_COMPLEX_USE_TABLE_HH",
	"#define HB_OT_SHAPE_COMPLEX_USE_TABLE_HH",
	"",
	'#include "hb.hh"',
	"",
	'#include "hb-ot-shape-complex-use-machine.hh"',
	"",
):
	print (guard_line)

# Running statistics updated by print_block.
total = used = 0
last_block = None
def print_block (block, start, end, data):
	"""Print the table entries for code points `start` through `end` inclusive.

	`block` is the block name to announce in a comment, or None for filler
	rows; the comment is printed only when the name differs from the one
	printed last.  `data` maps code points to tuples whose first item is
	the USE category; code points missing from it print the default.

	`start` and `end + 1` must be multiples of 8 (AssertionError otherwise).
	Updates the module-level counters `total` and `used`, and `last_block`.
	"""
	global total, used, last_block
	announce = block and block != last_block
	if announce:
		print ()
		print ()
		print ("  /* %s */" % block)
		column = start % 16
		if column:
			# The block starts part-way through a 16-entry row: indent
			# its first (partial) row according to that offset.
			print (' ' * (20 + (column * 6)), end='')
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	present = 0
	for u in range (start, end+1):
		if u % 16 == 0:
			# Each full row starts with a comment giving its first code point.
			print ()
			print ("  /* %04X */" % u, end='')
		entry = defaults
		if u in data:
			present += 1
			entry = data[u]
		print ("%6s," % entry[0], end='')

	total += end - start + 1
	used += present
	if block:
		last_block = block
495
# Code points in ascending order; table segments are emitted in this order.
uu = sorted (data.keys ())

# `last` is the last code point already printed (inclusive).  The large
# negative start value makes the first entry open a new segment.
last = -100000
num = 0
# `offset` counts the table items emitted in closed segments so far;
# `starts`/`ends` record each segment as [start, end) for the lookup function.
offset = 0
starts = []
ends = []
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
# Categories without positional sub-classes get a macro named after the tag ...
for k,v in sorted(use_mapping.items()):
	if k in use_positions and use_positions[k]: continue
	print ("#define %s	USE(%s)	/* %s */" % (k, k, v.__name__[3:]))
# ... the others get one macro per tag + suffix combination.
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#define %s	USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
print ("static const uint8_t use_table[] = {")
for u in uu:
	if u <= last:
		continue
	# Entries with the default category never start a run.
	if data[u][0] == 'O':
		continue
	block = data[u][1]

	# Extend the run over consecutive code points of the same block and
	# align both ends to multiples of 8.  Membership is tested against the
	# dict: `uu` holds exactly its keys, but scanning the list is linear.
	start = u//8*8
	end = start+1
	while end in data and block == data[end][1]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: pad it with default entries and stay in the segment.
			print_block (None, last+1, start-1, data)
		else:
			# Large gap: close the previous segment (if any) and open a new one.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define use_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final segment.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
# Percentage of printed table items that correspond to a code point in `data`.
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()
print ("static inline uint8_t")
print ("hb_use_get_category (hb_codepoint_t u)")
print ("{")
print ("  switch (u >> %d)" % page_bits)
print ("  {")
# One `case` per page (code point >> page_bits) touched by a segment boundary.
pages = {boundary >> page_bits for boundary in starts + ends}
for p in sorted (pages):
	print ("    case 0x%0Xu:" % p)
	for start, end in zip (starts, ends):
		if p != start >> page_bits and p != end >> page_bits:
			continue
		offset_macro = "use_offset_0x%04xu" % start
		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset_macro))
	print ("      break;")
	print ("")
print ("    default:")
print ("      break;")
print ("  }")
print ("  return USE(O);")
print ("}")
print ()
# Undefine every category macro again, in the order used for the #defines.
for k in sorted (use_mapping):
	if use_positions.get (k):
		continue
	print ("#undef %s" % k)
for k in sorted (use_positions):
	suffixes = use_positions[k]
	if not suffixes:
		continue
	for suf in suffixes:
		print ("#undef %s" % (k + suf))
print ()
print ()
print ("#endif /* HB_OT_SHAPE_COMPLEX_USE_TABLE_HH */")
print ("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
587