1#!/usr/bin/env python
2# flake8: noqa
3
4from __future__ import print_function, division, absolute_import
5
6import io
7import sys
8
# Exactly four input files must follow the script name; otherwise show the
# expected invocation on stderr and stop with a failure status.
if len (sys.argv[1:]) != 4:
	print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
	raise SystemExit (1)
12
# Names of Unicode blocks whose code points are filtered out of the merged
# data before the table is generated.
BLACKLISTED_BLOCKS = [
	"Thai",
	"Lao",
]
14
# Open the input files in command-line order (see the usage message):
# index 0 and 1 are the Indic category files, 2 is UnicodeData.txt and
# 3 is Blocks.txt.
files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]]

try:
	# Remember the first two lines of every file except UnicodeData.txt
	# (index 2); they are quoted in the banner of the generated output.
	headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
	headers.append (["UnicodeData.txt does not have a header."])

	# data[i]   maps code point -> property value read from file i.
	# values[i] maps property value -> number of code points carrying it.
	data = [{} for f in files]
	values = [{} for f in files]
	for i, f in enumerate (files):
		# The wanted value is field 2 in UnicodeData.txt and field 1 in
		# every other file.
		column = 2 if i == 2 else 1

		for line in f:

			# Drop a trailing '#' comment, if any.
			line = line.split ('#', 1)[0]

			fields = [x.strip () for x in line.split (';')]
			if len (fields) == 1:
				# Blank or comment-only line.
				continue

			# Field 0 is either one hex code point or a START..END range.
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			end = start if len (uu) == 1 else int (uu[1], 16)

			t = fields[column]

			for u in range (start, end + 1):
				data[i][u] = t
			values[i][t] = values[i].get (t, 0) + end - start + 1
finally:
	# Everything needed has been read (or parsing failed); release the
	# file handles instead of leaving them open for the rest of the run.
	for f in files:
		f.close ()
45
# Fallback value for each of the four inputs, in file order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in (
	0x034F,
	0x1B61, 0x1B63, 0x1B64, 0x1B65, 0x1B66, 0x1B67, 0x1B69, 0x1B6A,
	0x2060,
):
	data[0][u] = defaults[0]

# Code points forced to Consonant_Placeholder.
for u in (
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
	0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68,
	# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
	0x11C44, 0x11C45,
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
	0x111C8,
):
	data[0][u] = 'Consonant_Placeholder'

# U+FE00..U+FE0F get the default syllabic category as well.
u = 0xFE00
while u <= 0xFE0F:
	data[0][u] = defaults[0]
	u += 1
u = 0xFE0F
72
# Merge data into one dict:

# Count every default value once in its own table.
for idx, fallback in enumerate (defaults):
	counts = values[idx]
	counts[fallback] = counts.get (fallback, 0) + 1

# merged maps code point -> [value from file 0, file 1, file 2, file 3].
# Only the first two files may introduce a code point; the later files
# merely fill in columns for code points that are already present.
merged = {}
for idx, table in enumerate (data):
	for cp, prop in table.items ():
		row = merged.get (cp)
		if row is None:
			if idx >= 2:
				continue
			row = merged[cp] = list (defaults)
		row[idx] = prop

# Discard code points whose block (column 3) is blacklisted.
data = {cp: row for cp, row in merged.items () if row[3] not in BLACKLISTED_BLOCKS}
del merged
num = len (data)
88
89
# Every property value name that becomes a module-level constant, grouped
# by the property it belongs to.

# General_Category
_general_category_names = [
	'Cc', 'Cf', 'Cn', 'Co', 'Cs',
	'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
	'Mc', 'Me', 'Mn',
	'Nd', 'Nl', 'No',
	'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
	'Sc', 'Sk', 'Sm', 'So',
	'Zl', 'Zp', 'Zs',
]

# Indic_Syllabic_Category
_syllabic_category_names = [
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
]

# Indic_Positional_Category
_positional_category_names = [
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
]

property_names = (_general_category_names
		  + _syllabic_category_names
		  + _positional_category_names)
149
try:
	basestring
except NameError:
	# Python 3 has no basestring; str is the only text string type there.
	basestring = str

class PropertyValue(object):
	"""A property value name that compares and hashes like its string.

	An instance is equal to a plain string with the same text and to any
	object whose ``name`` attribute has the same text, and it hashes like
	that string, so it can be looked up in containers keyed by strings.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __repr__(self):
		return "PropertyValue(%r)" % (self.name,)
	def __eq__(self, other):
		if isinstance(other, basestring):
			return self.name == other
		try:
			other_name = other.name
		except AttributeError:
			# Not a string and not name-bearing: let Python fall back to
			# its default comparison instead of raising.
			return NotImplemented
		return self.name == other_name
	def __ne__(self, other):
		# Spelled out because Python 2 does not derive != from ==.
		equal = self.__eq__(other)
		if equal is NotImplemented:
			return equal
		return not equal
	def __hash__(self):
		# Must match the hash of the plain string (see class docstring).
		return hash(self.name)
166
# Build one PropertyValue per known name and publish each of them as a
# module-level constant, so the predicates below can refer to e.g. Lo or
# Virama directly.
property_values = {}

for prop_name in property_names:
	prop = PropertyValue(prop_name)
	# PropertyValue hashes and compares like its name, so these membership
	# tests catch a duplicate name or a clash with an existing global.
	assert prop not in property_values
	assert prop not in globals()
	property_values[prop_name] = prop

globals().update(property_values)
175
176
def is_BASE(U, UISC, UGC):
	"""Predicate behind the 'B' entry of use_mapping.

	U is the code point, UISC its Indic_Syllabic_Category and UGC its
	General_Category.
	"""
	always_base = (
		Number, Consonant, Consonant_Head_Letter,
		#SPEC-DRAFT Consonant_Placeholder,
		Tone_Letter,
		Vowel_Independent, #SPEC-DRAFT
	)
	# These categories only count when the character is a letter (Lo).
	base_when_letter = (
		Avagraha, Bindu, Consonant_Final, Consonant_Medial,
		Consonant_Subjoined, Vowel, Vowel_Dependent,
	)
	if UISC in always_base:
		return True
	return UGC == Lo and UISC in base_when_letter
def is_BASE_IND(U, UISC, UGC):
	"""Predicate behind the 'IND' entry of use_mapping.

	U is the code point, UISC its Indic_Syllabic_Category and UGC its
	General_Category.
	"""
	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	# SPEC-DRAFT-OUTDATED! U == 0x002D is not accepted here.
	if UISC in (Consonant_Dead, Modifying_Letter):
		return True
	# Po punctuation qualifies unless it is one of these code points.
	punctuation_exceptions = (
		0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022,
		0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45,
	)
	return UGC == Po and U not in punctuation_exceptions
def is_BASE_NUM(U, UISC, UGC):
	"""Predicate behind the 'N' entry of use_mapping: Brahmi joining numbers."""
	matches = (UISC == Brahmi_Joining_Number)
	return matches
def is_BASE_OTHER(U, UISC, UGC):
	"""Predicate behind the 'GB' entry of use_mapping.

	True for every Consonant_Placeholder and for a fixed list of code points.
	"""
	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	listed = (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE)
	if UISC == Consonant_Placeholder: #SPEC-DRAFT
		return True
	return U in listed
def is_CGJ(U, UISC, UGC):
	"""Predicate behind the 'CGJ' entry of use_mapping: matches only U+034F."""
	cgj = 0x034F
	return U == cgj
def is_CONS_FINAL(U, UISC, UGC):
	"""Predicate behind the 'F' entry of use_mapping.

	True for Consonant_Succeeding_Repha and for any Consonant_Final that is
	not a letter (Lo).
	"""
	if UISC == Consonant_Final and UGC != Lo:
		return True
	return UISC == Consonant_Succeeding_Repha
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Predicate behind the 'FM' entry of use_mapping: syllable modifiers."""
	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	matches = (UISC == Syllable_Modifier)
	return matches
def is_CONS_MED(U, UISC, UGC):
	"""Predicate behind the 'M' entry of use_mapping.

	True for Consonant_Initial_Postfixed and for any Consonant_Medial that
	is not a letter (Lo).
	"""
	if UISC == Consonant_Medial and UGC != Lo:
		return True
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return UISC == Consonant_Initial_Postfixed
def is_CONS_MOD(U, UISC, UGC):
	"""Predicate behind the 'CM' entry of use_mapping."""
	modifiers = (Nukta, Gemination_Mark, Consonant_Killer)
	return UISC in modifiers
def is_CONS_SUB(U, UISC, UGC):
	"""Predicate behind the 'SUB' entry of use_mapping.

	True for a Consonant_Subjoined that is not a letter (Lo).
	"""
	#SPEC-DRAFT return UISC == Consonant_Subjoined
	if UGC == Lo:
		return False
	return UISC == Consonant_Subjoined
def is_CONS_WITH_STACKER(U, UISC, UGC):
	"""Predicate behind the 'CS' entry of use_mapping."""
	matches = (UISC == Consonant_With_Stacker)
	return matches
def is_HALANT(U, UISC, UGC):
	"""Predicate behind the 'H' entry of use_mapping.

	True for a Virama or Invisible_Stacker that is claimed neither by
	is_HALANT_OR_VOWEL_MODIFIER nor by is_SAKOT.
	"""
	if UISC not in (Virama, Invisible_Stacker):
		return False
	if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
		return False
	return not is_SAKOT(U, UISC, UGC)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
	"""Predicate behind the 'HVM' entry of use_mapping: two fixed code points."""
	# https://github.com/harfbuzz/harfbuzz/issues/1102
	# https://github.com/harfbuzz/harfbuzz/issues/1379
	return U == 0x11046 or U == 0x1134D
def is_HALANT_NUM(U, UISC, UGC):
	"""Predicate behind the 'HN' entry of use_mapping: number joiners."""
	matches = (UISC == Number_Joiner)
	return matches
def is_ZWNJ(U, UISC, UGC):
	"""Predicate behind the 'ZWNJ' entry of use_mapping: Non_Joiner characters."""
	matches = (UISC == Non_Joiner)
	return matches
def is_ZWJ(U, UISC, UGC):
	"""Predicate behind the 'ZWJ' entry of use_mapping: Joiner characters."""
	matches = (UISC == Joiner)
	return matches
def is_Word_Joiner(U, UISC, UGC):
	"""Predicate behind the 'WJ' entry of use_mapping: matches only U+2060."""
	word_joiner = 0x2060
	return U == word_joiner
def is_OTHER(U, UISC, UGC):
	"""Predicate behind the 'O' entry of use_mapping.

	True when UISC is Other and none of the more specific predicates listed
	below accepts the character.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if not (UISC == Other):
		return False
	more_specific = (
		is_SYM,
		is_SYM_MOD,
		is_CGJ,
		is_Word_Joiner,
		is_VARIATION_SELECTOR,
	)
	for predicate in more_specific:
		if predicate(U, UISC, UGC):
			return False
	return True
def is_Reserved(U, UISC, UGC):
	"""Predicate behind the 'Rsv' entry of use_mapping: General_Category 'Cn'."""
	reserved_category = 'Cn'
	return UGC == reserved_category
def is_REPHA(U, UISC, UGC):
	"""Predicate behind the 'R' entry of use_mapping."""
	repha_categories = (Consonant_Preceding_Repha, Consonant_Prefixed)
	return UISC in repha_categories
def is_SAKOT(U, UISC, UGC):
	"""Predicate behind the 'Sk' entry of use_mapping: matches only U+1A60."""
	sakot = 0x1A60
	return U == sakot
def is_SYM(U, UISC, UGC):
	"""Predicate behind the 'S' entry of use_mapping.

	True for General_Category So or Sc, except for three excluded code
	points.
	"""
	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
	excluded = (
		0x25CC, #SPEC-DRAFT
		0x1B62, 0x1B68,
	)
	if U in excluded:
		return False
	return UGC in (So, Sc)
def is_SYM_MOD(U, UISC, UGC):
	"""Predicate behind the 'SM' entry of use_mapping: U+1B6B through U+1B73."""
	return U in range (0x1B6B, 0x1B73 + 1)
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Predicate behind the 'VS' entry of use_mapping: U+FE00 through U+FE0F."""
	if U < 0xFE00:
		return False
	return U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Predicate behind the 'V' entry of use_mapping.

	True for Pure_Killer, and for a Vowel or Vowel_Dependent that is not a
	letter (Lo) and is not U+AA29.
	"""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC == Pure_Killer:
		return True
	if U == 0xAA29:
		# is_VOWEL_MOD handles this code point instead.
		return False
	return UGC != Lo and UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UGC):
	"""Predicate behind the 'VM' entry of use_mapping.

	True for tone, cantillation, register-shifter and visarga marks, and
	for a non-letter (not Lo) that is a Bindu or is U+AA29.
	"""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	return UGC != Lo and (UISC == Bindu or U == 0xAA29)
263
# USE category tag -> predicate deciding membership.  map_to_use() requires
# that exactly one predicate accepts any given character.
use_mapping = dict ([
	('B',	is_BASE),
	('IND',	is_BASE_IND),
	('N',	is_BASE_NUM),
	('GB',	is_BASE_OTHER),
	('CGJ',	is_CGJ),
	('F',	is_CONS_FINAL),
	('FM',	is_CONS_FINAL_MOD),
	('M',	is_CONS_MED),
	('CM',	is_CONS_MOD),
	('SUB',	is_CONS_SUB),
	('CS',	is_CONS_WITH_STACKER),
	('H',	is_HALANT),
	('HVM',	is_HALANT_OR_VOWEL_MODIFIER),
	('HN',	is_HALANT_NUM),
	('ZWNJ',	is_ZWNJ),
	('ZWJ',	is_ZWJ),
	('WJ',	is_Word_Joiner),
	('O',	is_OTHER),
	('Rsv',	is_Reserved),
	('R',	is_REPHA),
	('S',	is_SYM),
	('Sk',	is_SAKOT),
	('SM',	is_SYM_MOD),
	('VS',	is_VARIATION_SELECTOR),
	('V',	is_VOWEL),
	('VM',	is_VOWEL_MOD),
])
292
# USE category tag -> {position suffix: positional categories that select it}.
# A category mapped to None accepts a positional category but gets no suffix.
use_positions = {}

use_positions['F'] = {
	'Abv': [Top],
	'Blw': [Bottom],
	'Pst': [Right],
}
use_positions['M'] = {
	'Abv': [Top],
	'Blw': [Bottom, Bottom_And_Left],
	'Pst': [Right],
	'Pre': [Left],
}
use_positions['CM'] = {
	'Abv': [Top],
	'Blw': [Bottom],
}
use_positions['V'] = {
	'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
	'Blw': [Bottom, Overstruck, Bottom_And_Right],
	'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	'Pre': [Left],
}
use_positions['VM'] = {
	'Abv': [Top],
	'Blw': [Bottom, Overstruck],
	'Pst': [Right],
	'Pre': [Left],
}
use_positions['SM'] = {
	'Abv': [Top],
	'Blw': [Bottom],
}
use_positions['H'] = None
use_positions['HVM'] = None
use_positions['B'] = None
use_positions['FM'] = {
	'Abv': [Top],
	'Blw': [Bottom],
	'Pst': [Not_Applicable],
}
use_positions['SUB'] = None
335
def map_to_use(data):
	"""Translate merged Unicode properties into USE categories.

	data maps each code point to a (UISC, UIPC, UGC, UBlock) sequence.  The
	result maps the same code points to (USE, UBlock) pairs, where USE is a
	key of use_mapping, extended with a position suffix when use_positions
	holds a non-empty position table for that key.

	Asserts that exactly one use_mapping predicate accepts each character,
	that a character with a real positional category belongs to a category
	listed in use_positions, and that exactly one position suffix matches
	when a position table applies.
	"""
	out = {}
	classifiers = use_mapping.items()
	for U, properties in data.items():
		UISC, UIPC, UGC, UBlock = properties

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
		if U >= 0x1CE2 and U <= 0x1CE8:
			UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
		if U in (0x0F18, 0x0F19, 0x0F3E, 0x0F3F):
			UISC = Vowel_Dependent
		if U in (0x0F86, 0x0F87):
			UISC = Tone_Mark
		# Overrides to allow NFC order matching syllable
		# https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC) and UIPC == Top:
			UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
			# Swap above and below.
			if UIPC == Top:
				UIPC = Bottom
			elif UIPC == Bottom:
				UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		if U in (0x1BF2, 0x1BF3):
			UISC = Nukta
			UIPC = Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
		if U == 0x11134:
			UISC = Gemination_Mark

		# Exactly one predicate may claim the character.
		values = []
		for tag, predicate in classifiers:
			if predicate(U,UISC,UGC):
				values.append (tag)
		assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: These should die, but have UIPC in Unicode 12.0
		if U == 0x953 or U == 0x954:
			UIPC = Not_Applicable

		# TODO: In USE's override list but not in Unicode 12.0
		if U == 0x103C:
			UIPC = Left

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/2012
		if U == 0x1C29:
			UIPC = Left

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
		if U >= 0xA926 and U <= 0xA92A:
			UIPC = Top
		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in (0x11302, 0x11303, 0x114C1):
			UIPC = Top
		if U == 0x1171E:
			UIPC = Left
		if U in (0x1CF8, 0x1CF9):
			UIPC = Top

		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		# Append the position suffix when this category has a position table.
		pos_mapping = use_positions.get(USE)
		if pos_mapping:
			values = []
			for suffix, positions in pos_mapping.items():
				if positions and UIPC in positions:
					values.append (suffix)
			assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out
408
# From here on each data entry is a (USE category, block) pair, and this is
# the pair print_block() uses for code points that have no entry.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Banner: how the table was generated.
for text in (
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
):
	print (text)

# Quote the header lines captured from the input files.
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))

# Close the banner comment and open the C translation unit.
for text in (
	" */",
	"",
	'#include "hb.hh"',
	"",
	'#ifndef HB_NO_OT_SHAPE',
	"",
	'#include "hb-ot-shape-complex-use.hh"',
	"",
):
	print (text)
431
# Running statistics over everything print_block() has emitted.
total = 0		# table slots printed
used = 0		# slots backed by a real data entry
last_block = None	# most recent block name that was passed in
def print_block (block, start, end, data):
	"""Print the table entries for code points start..end inclusive.

	block is the block name to announce in a comment, or None for filler
	between blocks.  start must be a multiple of 8 and end + 1 as well
	(asserted).  Code points missing from data are printed with the
	category from the module-level defaults.  Updates the module-level
	total, used and last_block.
	"""
	global total, used, last_block

	if block and block != last_block:
		print ()
		print ()
		print ("  /* %s */" % block)
		column = start % 16
		if column:
			# The block starts mid-row: pad so its entries line up with
			# the rows that follow.
			print (' ' * (20 + (column * 6)), end='')

	assert start % 8 == 0
	assert (end+1) % 8 == 0

	occupied = 0
	for cp in range (start, end+1):
		if cp % 16 == 0:
			# Start a new row, labelled with its first code point.
			print ()
			print ("  /* %04X */" % cp, end='')
		if cp in data:
			occupied += 1
			entry = data[cp]
		else:
			entry = defaults
		print ("%6s," % entry[0], end='')

	total += end - start + 1
	used += occupied
	if block:
		last_block = block
459
# All mapped code points in ascending order.
uu = sorted (data)

# State shared with the table-emission loop that follows.
last = -100000
num = 0
offset = 0
starts = []
ends = []

# Short aliases for the USE_* names; they are #undef'ed again at the end of
# the generated file.
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k in sorted (use_mapping):
	if use_positions.get (k):
		# Categories with a position table are only emitted with a suffix.
		continue
	v = use_mapping[k]
	print ("#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:]))
for k in sorted (use_positions):
	v = use_positions[k]
	if not v:
		continue
	for suf in v:
		tag = k + suf
		print ("#define %s	USE_%s" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
# Emit the table body.  Consecutive code points of one block are printed
# together; small gaps are padded with default entries, while a large gap
# starts a new range with its own use_offset_* macro.
print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
for u in uu:
	if u <= last:
		# Already covered by a block printed earlier.
		continue
	block = data[u][1]

	# Grow [start, end] over consecutive mapped code points of the same
	# block, then round out to a multiple of 8 on both sides.
	start = u//8*8
	end = start+1
	# data is keyed by exactly the code points listed in uu, so testing the
	# dict gives the same answer as scanning the list, in constant time.
	while end in data and block == data[end][1]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: fill it so the current range stays contiguous.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			# Large gap: close the current range (if any) and open a new
			# one at the current table offset.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define use_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final range.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
# Percentage of printed slots that hold real data.
occupancy = used * 100. / total
# The generated lookup function switches on u >> page_bits.
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
# Emit hb_use_get_category(): a switch on the page of the code point, with
# one range test per table range touching that page.
print ()
print ("USE_TABLE_ELEMENT_TYPE")
print ("hb_use_get_category (hb_codepoint_t u)")
print ("{")
print ("  switch (u >> %d)" % page_bits)
print ("  {")
ranges = list (zip (starts, ends))
pages = {bound >> page_bits for bound in starts + ends}
for p in sorted (pages):
	print ("    case 0x%0Xu:" % p)
	for range_start, range_end in ranges:
		# A range is listed under the pages of both of its boundaries.
		if p != range_start >> page_bits and p != range_end >> page_bits:
			continue
		offset = "use_offset_0x%04xu" % range_start
		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (range_start, range_end-1, range_start, offset))
	print ("      break;")
	print ("")
print ("    default:")
print ("      break;")
print ("  }")
print ("  return USE_O;")
print ("}")
print ()

# Remove the short aliases again, in the order they were defined.
for k in sorted (use_mapping):
	if use_positions.get (k):
		continue
	print ("#undef %s" % k)
for k in sorted (use_positions):
	suffixes = use_positions[k]
	if not suffixes:
		continue
	for suf in suffixes:
		print ("#undef %s" % (k + suf))
print ()
print ()
print ('#endif')
print ("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
550