1#!/usr/bin/env python3
2
3"""Generator of the function to prohibit certain vowel sequences.
4
5It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
6circles into sequences prohibited by the USE script development spec.
7This function should be used as the ``preprocess_text`` of an
8``hb_ot_complex_shaper_t``.
9
10"""
11
12import collections
13from html.parser import HTMLParser
def write (s):
	"""Write the string ``s`` to standard output as UTF-8 encoded bytes.

	The bytes go straight to ``sys.stdout.buffer``, bypassing the text
	layer.  The text layer is flushed first so that anything previously
	written through it (for example with ``print``) is emitted before
	these bytes.

	Args:
		s (str): The text to write.

	"""
	out = sys.stdout
	# Flush before encoding, so pending text is pushed out even if the
	# encoding step below fails.
	out.flush ()
	encoded = s.encode ('utf-8')
	out.buffer.write (encoded)
17import itertools
18import io
19import sys
20
# The script takes exactly two positional arguments: the invalid-cluster
# list and Scripts.txt.  Anything else prints the usage text on standard
# error and exits with status 1.
usage_text = """usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt

Input file, as of Unicode 12:
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt"""
argument_count = len (sys.argv) - 1
if argument_count != 2:
	print (usage_text, file=sys.stderr)
	sys.exit (1)
27
# Parse the second argument (Scripts.txt) into:
#   scripts_header: the first two lines of the file, kept verbatim;
#   scripts:        code point -> script name;
#   script_order:   script name -> start of the first range listed for it.
with io.open (sys.argv[2], encoding='utf-8') as f:
	scripts_header = [f.readline (), f.readline ()]
	scripts = {}
	script_order = {}
	for line in f:
		# Everything from the first '#' onwards is a comment.
		record = line.split ('#', 1)[0]
		fields = [field.strip () for field in record.split (';')]
		if len (fields) == 1:
			# No ';' separator: blank or comment-only line.
			continue
		# The first field is either a single code point or "START..END".
		code_range = fields[0].split ('..')
		start = int (code_range[0], 16)
		end = int (code_range[1], 16) if len (code_range) != 1 else start
		script = fields[1]
		scripts.update (dict.fromkeys (range (start, end + 1), script))
		# Only the first range seen for a script determines its order.
		script_order.setdefault (script, start)
50
class ConstraintSet (object):
	"""A set of prohibited code point sequences.

	Sequences sharing a prefix are merged into a tree.  Converting the set
	to a string renders the nested ``if``/``switch`` statements that test
	the code points at and after the buffer's current position.

	Args:
		constraint (List[int]): A prohibited code point sequence.

	"""
	def __init__ (self, constraint):
		# ``_c`` takes one of two shapes:
		#  * a list of code points: a single prohibited sequence;
		#  * a dictionary: several prohibited sequences, each made of a
		#    key (a code point) followed by any sequence in the
		#    ConstraintSet stored under that key.
		self._c = constraint

	def add (self, constraint):
		"""Add a constraint to this set.

		An empty ``constraint`` is ignored.  When one sequence is a prefix
		of the other, only the shorter one is kept.

		Args:
			constraint (List[int]): A prohibited code point sequence.

		"""
		if not constraint:
			return
		head, tail = constraint[0], constraint[1:]
		stored = self._c
		if isinstance (stored, list):
			if stored[:len (constraint)] == constraint:
				# The new sequence is a prefix of the stored one.
				self._c = constraint
			elif constraint[:len (stored)] != stored:
				# Neither is a prefix of the other: branch on the
				# first code point.
				self._c = {stored[0]: ConstraintSet (stored[1:])}
		if not isinstance (self._c, dict):
			return
		branch = self._c.get (head)
		if branch is None:
			self._c[head] = ConstraintSet (tail)
		else:
			branch.add (tail)

	@staticmethod
	def _indent (depth):
		"""Return the indentation for ``depth`` levels.

		Each level is two spaces; every run of eight spaces becomes a tab.
		"""
		spaces = '  ' * depth
		return spaces.replace ('        ', '\t')

	def __str__ (self, index=0, depth=4):
		"""Render this set as source text.

		Args:
			index (int): Offset from the buffer's current position of the
				code point tested first.
			depth (int): Indentation level of the outermost statement.

		Returns:
			str: The generated statements, each ending with a newline.

		"""
		if isinstance (self._c, list):
			pieces = self._sequence_lines (index, depth)
		else:
			pieces = self._switch_lines (index, depth)
		return ''.join (pieces)

	def _sequence_lines (self, index, depth):
		"""Return the lines testing the single sequence held as a list."""
		seq = self._c
		pad = self._indent (depth)
		position = index or ''
		if len (seq) == 0:
			assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
			return ['{}matched = true;\n'.format (pad)]
		if len (seq) == 1:
			assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
			return ['{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (pad, seq[0], position)]
		body_pad = self._indent (depth + 1)
		condition_pad = self._indent (depth + 2)
		final = len (seq) - 1
		lines = ['{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (pad, seq[0], position)]
		if index:
			# Emitted only when the first test is past the current position.
			lines.append ('{}buffer->idx + {} < count &&\n'.format (condition_pad, index + 1))
		for offset, cp in enumerate (seq[1:], start=1):
			closing = ')' if offset == final else ' &&'
			lines.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
				condition_pad, cp, index + offset, closing))
		lines.append ('{}{{\n'.format (pad))
		lines.extend (['{}buffer->next_glyph ();\n'.format (body_pad)] * (index + 1))
		lines.append ('{}_output_dotted_circle (buffer);\n'.format (body_pad))
		lines.append ('{}}}\n'.format (pad))
		return lines

	def _switch_lines (self, index, depth):
		"""Return the lines of the ``switch`` over the dictionary's keys."""
		pad = self._indent (depth)
		label_pad = self._indent (depth + 1)
		break_line = '{}break;\n'.format (self._indent (depth + 2))
		lines = [
			'{}switch (buffer->cur ({}).codepoint)\n'.format (pad, index or ''),
			'{}{{\n'.format (pad),
		]
		# Code points whose continuations render identically share a body.
		labels_by_body = collections.defaultdict (set)
		for cp, continuation in sorted (self._c.items (), key=lambda item: item[0]):
			labels_by_body[continuation.__str__ (index + 1, depth + 2)].add (cp)
		# Bodies are ordered by their smallest label.
		for body, labels in sorted (labels_by_body.items (), key=lambda item: min (item[1])):
			ordered = sorted (labels)
			# At most four ``case`` labels per line.
			for offset in range (0, len (ordered), 4):
				row = ' '.join ('case 0x{:04X}u:'.format (cp) for cp in ordered[offset:offset + 4])
				lines.append ('{}{}\n'.format (label_pad, row))
			lines.append (body)
			lines.append (break_line)
		lines.append ('{}}}\n'.format (pad))
		return lines
129
# Parse the first argument (the list of prohibited sequences) into:
#   constraints_header: the stripped lines preceding the first line that
#                       consists solely of '#';
#   constraints:        script name -> ConstraintSet of the sequences whose
#                       first code point belongs to that script.
constraints = {}
with io.open (sys.argv[1], encoding='utf-8') as f:
	constraints_header = []
	while True:
		line = f.readline ()
		if not line:
			# End of file without a lone '#' line.  readline () keeps
			# returning '' here, so stop instead of looping forever; the
			# check after the loop below then reports the missing data.
			break
		line = line.strip ()
		if line == '#':
			break
		constraints_header.append (line)
	for line in f:
		# Everything from the first '#' onwards is a comment.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]
		# The first ';'-separated field holds space-separated hexadecimal
		# code points.
		constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
		if not constraint:
			continue
		assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
		# A sequence is filed under the script of its first code point.
		script = scripts[constraint[0]]
		if script in constraints:
			constraints[script].add (constraint)
		else:
			constraints[script] = ConstraintSet (constraint)
# Checked once the whole file has been read: inside the loop this could
# never fail, because an entry has always just been stored.
assert constraints, 'No constraints found'
151
# Opening comment of the generated output: how it was produced, followed by
# the header lines of both input files.
banner_lines = [
	'/* == Start of generated functions == */',
	'/*',
	' * The following functions are generated by running:',
	' *',
	' *   %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0],
	' *',
	' * on files with these headers:',
	' *',
]
banner_lines.extend (' * %s' % header_line.strip () for header_line in constraints_header)
banner_lines.append (' *')
banner_lines.extend (' * %s' % header_line.strip () for header_line in scripts_header)
banner_lines.append (' */')
print ('\n'.join (banner_lines))
166
# Includes and the two static helper functions of the generated output.
# An empty entry stands for a blank output line.
preamble_lines = (
	'',
	'#include "hb.hh"',
	'',
	'#ifndef HB_NO_OT_SHAPE',
	'',
	'#include "hb-ot-shape-complex-vowel-constraints.hh"',
	'',
	'static void',
	'_output_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);',
	'  _hb_glyph_info_reset_continuation (&dottedcircle);',
	'}',
	'',
	'static void',
	'_output_with_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  _output_dotted_circle (buffer);',
	'  buffer->next_glyph ();',
	'}',
	'',
)
for preamble_line in preamble_lines:
	print (preamble_line)
188
# Signature and opening statements of the generated
# ``_hb_preprocess_text_vowel_constraints``, up to and including the opening
# brace of its ``switch`` on the buffer's script.  An empty entry stands
# for a blank output line.
prologue_lines = (
	'void',
	'_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,',
	'\t\t\t\t       hb_buffer_t              *buffer,',
	'\t\t\t\t       hb_font_t                *font HB_UNUSED)',
	'{',
	'#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS',
	'  return;',
	'#endif',
	'  if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)',
	'    return;',
	'',
	'  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
	'   * vowel-sequences that look like another vowel.  Data for each script',
	'   * collected from the USE script development spec.',
	'   *',
	'   * https://github.com/harfbuzz/harfbuzz/issues/1019',
	'   */',
	'  bool processed = false;',
	'  buffer->clear_output ();',
	'  unsigned int count = buffer->len;',
	'  switch ((unsigned) buffer->props.script)',
	'  {',
)
print ('\n'.join (prologue_lines))
211
# Emit one ``case`` per script that has constraints, ordered by the start of
# the first range listed for the script in Scripts.txt.
#
# The loop target is named ``constraint_set`` rather than ``constraints`` so
# that it does not rebind the module-level ``constraints`` dictionary being
# iterated.
for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
	print ('    case HB_SCRIPT_{}:'.format (script.upper ()))
	print ('      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
	print ('      {')
	print ('\tbool matched = false;')
	# ``write`` flushes the text printed so far before emitting the
	# rendered constraint set, which keeps the output in order.
	write (str (constraint_set))
	print ('\tbuffer->next_glyph ();')
	print ('\tif (matched) _output_with_dotted_circle (buffer);')
	print ('      }')
	print ('      processed = true;')
	print ('      break;')
	print ()
224
# Close the generated ``switch`` and function, then the ``#ifndef`` guard.
# An empty entry stands for a blank output line.
epilogue_lines = (
	'    default:',
	'      break;',
	'  }',
	'  if (processed)',
	'  {',
	'    if (buffer->idx < count)',
	'      buffer->next_glyph ();',
	'    buffer->swap_buffers ();',
	'  }',
	'}',
	'',
	'',
	'#endif',
	'/* == End of generated functions == */',
)
for epilogue_line in epilogue_lines:
	print (epilogue_line)
240