1#!/usr/bin/env python3
2
3"""Generator of the function to prohibit certain vowel sequences.
4
5It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
6circles into sequences prohibited by the USE script development spec.
7This function should be used as the ``preprocess_text`` of an
8``hb_ot_complex_shaper_t``.
9
10usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt
11
12Input file:
13* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
14"""
15
16import collections
def write (s):
	"""Write ``s`` to standard output as UTF-8 bytes.

	The text layer of ``sys.stdout`` is flushed first so that anything
	already printed is emitted before the bytes written here.

	Args:
		s (str): The text to write; no newline is appended.
	"""
	out = sys.stdout
	out.flush ()
	encoded = s.encode ('utf-8')
	out.buffer.write (encoded)
20import sys
21
# The script takes exactly two arguments; anything else prints the usage
# text (the module docstring) and exits.
if len (sys.argv) != 3:
	sys.exit (__doc__)

# Read the scripts data file given as the second argument.
#   scripts_header: its first two lines, kept verbatim.
#   scripts:        code point -> script name.
#   script_order:   script name -> first code point of the first range
#                   listed for that script.
with open (sys.argv[2], encoding='utf-8') as f:
	scripts_header = [f.readline () for _ in range (2)]
	scripts = {}
	script_order = {}
	for raw in f:
		# Discard everything from the first '#' on.
		data = raw.partition ('#')[0]
		# Lines without a ';' separator carry no record.
		if ';' not in data:
			continue
		fields = [field.strip () for field in data.split (';')]
		# The first field is either "XXXX" or "XXXX..YYYY" (hexadecimal).
		bounds = fields[0].split ('..')
		first = int (bounds[0], 16)
		last = int (bounds[1], 16) if len (bounds) > 1 else first
		name = fields[1]
		scripts.update (dict.fromkeys (range (first, last + 1), name))
		# Only the first range seen for a script defines its order.
		script_order.setdefault (name, first)
47
class ConstraintSet (object):
	"""A set of prohibited code point sequences.

	A stored sequence that is a prefix of a newly added one makes the new
	one redundant, and a newly added sequence that is a prefix of a stored
	one replaces it: only the shortest prohibited prefixes are kept.

	Args:
		constraint (List[int]): A prohibited code point sequence.

	"""
	def __init__ (self, constraint):
		# Either a list or a dictionary. As a list of code points, it
		# represents a prohibited code point sequence. As a dictionary,
		# it represents a set of prohibited sequences, where each item
		# represents the set of prohibited sequences starting with the
		# key (a code point) concatenated with any of the values
		# (ConstraintSets).
		self._c = constraint

	def add (self, constraint):
		"""Add a constraint to this set.

		Args:
			constraint (List[int]): A prohibited code point sequence.
				An empty sequence is ignored.
		"""
		if not constraint:
			return
		first = constraint[0]
		rest = constraint[1:]
		if isinstance (self._c, list):
			if constraint == self._c[:len (constraint)]:
				# The new sequence is a prefix of the stored one, so it
				# supersedes it.
				self._c = constraint
			elif self._c != constraint[:len (self._c)]:
				# Neither is a prefix of the other: switch to the
				# dictionary form, keyed by the first code point. The
				# new sequence is inserted by the dictionary branch
				# below.
				self._c = {self._c[0]: ConstraintSet (self._c[1:])}
			# Otherwise the stored sequence is a prefix of the new one
			# and already covers it; nothing to do.
		if isinstance (self._c, dict):
			if first not in self._c or not rest:
				# Either a new first code point, or the new sequence
				# ends exactly here. In the latter case it is a prefix
				# of everything stored under ``first`` and must replace
				# it; delegating to ``add`` with an empty remainder
				# would silently drop the new sequence.
				self._c[first] = ConstraintSet (rest)
			else:
				self._c[first].add (rest)

	@staticmethod
	def _indent (depth):
		"""Return the indentation for ``depth`` levels of two spaces, with
		each run of eight spaces collapsed into a tab."""
		return ('  ' * depth).replace ('        ', '\t')

	def __str__ (self, index=0, depth=4):
		"""Return C code that sets ``matched`` when this set matches.

		Args:
			index (int): Offset passed to ``buffer->cur`` for the code
				point this set starts matching at.
			depth (int): Indentation depth of the emitted code.

		Raises:
			AssertionError: If the set needs a ``matched`` assignment at
				an offset the generator does not support.
		"""
		s = []
		indent = self._indent (depth)
		if isinstance (self._c, list):
			if len (self._c) == 0:
				# Nothing left to compare: reaching this point is a match.
				assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
				s.append ('{}matched = true;\n'.format (indent))
			elif len (self._c) == 1:
				assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
				s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
			else:
				# NOTE(review): the bounds check below only covers
				# ``buffer->cur (index + 1)`` and only ``index`` glyphs
				# are skipped, which looks right when exactly two code
				# points remain; confirm before relying on this branch
				# for longer remainders.
				s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
				if index:
					s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
				for i, cp in enumerate (self._c[1:], start=1):
					s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
						self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
				s.append ('{}{{\n'.format (indent))
				for i in range (index):
					s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
				s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
				s.append ('{}}}\n'.format (indent))
		else:
			s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
			s.append ('{}{{\n'.format (indent))
			# Group the code points whose continuation generates the same
			# code so they can share one run of ``case`` labels.
			cases = collections.defaultdict (set)
			for first, rest in sorted (self._c.items ()):
				cases[rest.__str__ (index + 1, depth + 2)].add (first)
			# Emit the groups ordered by their smallest code point, four
			# ``case`` labels per line.
			for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
				for i, cp in enumerate (sorted (labels)):
					if i % 4 == 0:
						s.append (self._indent (depth + 1))
					else:
						s.append (' ')
					s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
				if len (labels) % 4 != 0:
					s.append ('\n')
				s.append (body)
				s.append ('{}break;\n'.format (self._indent (depth + 2)))
			s.append ('{}}}\n'.format (indent))
		return ''.join (s)
126
# Script name -> ConstraintSet of the sequences prohibited in that script.
constraints = {}
with open (sys.argv[1], encoding='utf-8') as f:
	# Every line before the first line consisting of a lone '#' is kept
	# (stripped) as the file's header. Iterating the file stops at end of
	# file, so a file without that separator line cannot loop forever.
	constraints_header = []
	for line in f:
		line = line.strip ()
		if line == '#':
			break
		constraints_header.append (line)
	# The remaining lines hold one sequence each: whitespace-separated
	# hexadecimal code points, optionally followed by ';' fields and a
	# '#' comment.
	for line in f:
		j = line.find ('#')
		if j >= 0:
			line = line[:j]
		constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
		if not constraint: continue
		assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
		# A sequence is filed under the script of its first code point.
		script = scripts[constraint[0]]
		if script in constraints:
			constraints[script].add (constraint)
		else:
			constraints[script] = ConstraintSet (constraint)
	# Checked once the whole file has been read; inside the loop this
	# could never fail.
	assert constraints, 'No constraints found'
148
# Banner comment: how the output was generated, followed by the header
# lines of both input files.
banner = [
	'/* == Start of generated functions == */',
	'/*',
	' * The following functions are generated by running:',
	' *',
	' *   %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0],
	' *',
	' * on files with these headers:',
	' *',
]
banner.extend (' * %s' % header_line.strip () for header_line in constraints_header)
banner.append (' *')
banner.extend (' * %s' % header_line.strip () for header_line in scripts_header)
banner.append (' */')
print ('\n'.join (banner))

# Includes, the two dotted-circle helpers, and the opening of
# _hb_preprocess_text_vowel_constraints up to the per-script switch.
print ('\n'.join ([
	'',
	'#include "hb.hh"',
	'',
	'#ifndef HB_NO_OT_SHAPE',
	'',
	'#include "hb-ot-shape-complex-vowel-constraints.hh"',
	'',
	'static void',
	'_output_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  (void) buffer->output_glyph (0x25CCu);',
	'  _hb_glyph_info_reset_continuation (&buffer->prev());',
	'}',
	'',
	'static void',
	'_output_with_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  _output_dotted_circle (buffer);',
	'  (void) buffer->next_glyph ();',
	'}',
	'',
	'void',
	'_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,',
	'\t\t\t\t       hb_buffer_t              *buffer,',
	'\t\t\t\t       hb_font_t                *font HB_UNUSED)',
	'{',
	'#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS',
	'  return;',
	'#endif',
	'  if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)',
	'    return;',
	'',
	'  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
	'   * vowel-sequences that look like another vowel.  Data for each script',
	'   * collected from the USE script development spec.',
	'   *',
	'   * https://github.com/harfbuzz/harfbuzz/issues/1019',
	'   */',
	'  buffer->clear_output ();',
	'  unsigned int count = buffer->len;',
	'  switch ((unsigned) buffer->props.script)',
	'  {',
]))
207
# Emit one ``case`` per script, ordered by the first code point recorded
# for the script in ``script_order``. The loop uses its own names so the
# ``constraints`` dictionary is not rebound while it is being walked.
for script in sorted (constraints, key=lambda name: script_order[name]):
	constraint_set = constraints[script]
	print ('    case HB_SCRIPT_{}:'.format (script.upper ()))
	print ('      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
	print ('      {')
	print ('\tbool matched = false;')
	# The generated matching code already ends with a newline, so it is
	# written as-is rather than printed.
	write (str (constraint_set))
	print ('\t(void) buffer->next_glyph ();')
	print ('\tif (matched) _output_with_dotted_circle (buffer);')
	print ('      }')
	print ('      break;')
	print ()
219
# Close the script switch and the generated function, then emit the
# trailing ``#endif`` and the end-of-generated-code marker.
print ('\n'.join ([
	'    default:',
	'      break;',
	'  }',
	'  buffer->swap_buffers ();',
	'}',
	'',
	'',
	'#endif',
	'/* == End of generated functions == */',
]))
230