#!/usr/bin/env python3

"""Generator of the function to prohibit certain vowel sequences.

It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.
This function should be used as the ``preprocess_text`` of an
``hb_ot_complex_shaper_t``.

"""

import collections
from html.parser import HTMLParser
import io
import itertools
import sys

USAGE = """usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt

Input file, as of Unicode 12:
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt"""


def write (s):
    """Write ``s`` to standard output as UTF-8 bytes.

    The text layer is flushed first so that output produced by earlier
    ``print`` calls stays ahead of the raw bytes written here.
    """
    sys.stdout.flush ()
    sys.stdout.buffer.write (s.encode ('utf-8'))


def strip_comment (line):
    """Return ``line`` with everything from the first ``#`` onwards removed."""
    return line.partition ('#')[0]


def parse_scripts (f):
    """Parse an open ``Scripts.txt``-style file.

    Args:
        f: A text file object positioned at the start of the file.

    Returns:
        A tuple ``(header, scripts, script_order)``: the first two raw lines
        of the file, a dict mapping each code point to its script name, and
        a dict mapping each script name to the start of the first range in
        which it appears (used to order the generated ``case`` labels).
    """
    header = [f.readline () for _ in range (2)]
    scripts = {}
    script_order = {}
    for line in f:
        fields = [x.strip () for x in strip_comment (line).split (';')]
        if len (fields) == 1:
            # Blank or comment-only line.
            continue
        bounds = fields[0].split ('..')
        start = int (bounds[0], 16)
        end = start if len (bounds) == 1 else int (bounds[1], 16)
        script = fields[1]
        for u in range (start, end + 1):
            scripts[u] = script
        if script not in script_order:
            script_order[script] = start
    return header, scripts, script_order


class ConstraintSet (object):
    """A set of prohibited code point sequences.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        A sequence that is a prefix of one already stored replaces it, and
        a sequence that merely extends a stored one is ignored, because the
        shorter sequence already prohibits it.  An empty ``constraint`` is
        a no-op.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # The sequences diverge: switch to the dictionary form.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        if isinstance (self._c, dict):
            if rest and first in self._c:
                self._c[first].add (rest)
            else:
                # Either a new key, or the sequence ends exactly at an
                # existing key.  In the latter case the shorter sequence
                # subsumes everything stored under that key; delegating an
                # empty remainder to ``add`` would silently drop it.
                self._c[first] = ConstraintSet (rest)

    @staticmethod
    def _indent (depth):
        # NOTE(review): whitespace runs inside string literals look collapsed
        # in the copy under review; confirm the intended indent widths here
        # and in the emitted C text before regenerating checked-in output.
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the C code that tests the buffer against this set.

        Args:
            index: Offset from the current buffer position of the code point
                tested by this level.
            depth: Indentation depth of the emitted code.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
            else:
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
                if index:
                    s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
                for i, cp in enumerate (self._c[1:], start=1):
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
                s.append ('{}{{\n'.format (indent))
                for i in range (index + 1):
                    s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
                s.append ('{}_output_dotted_circle (buffer);\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group code points whose sub-trees generate identical code so
            # they can share one body under several `case` labels.
            cases = collections.defaultdict (set)
            for first, rest in sorted (self._c.items ()):
                cases[rest.__str__ (index + 1, depth + 2)].add (first)
            for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
                # At most four `case` labels per output line.
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)


def read_constraints_header (f):
    """Read the header of the constraints file.

    Consumes lines from ``f`` up to and including the first line that is
    exactly ``#`` after stripping, and returns the stripped lines before it.
    Reading also stops at end of file, so a file without the terminator
    line cannot make this loop forever.
    """
    header = []
    # ``readline`` returns '' only at EOF; blank lines come back as '\n'.
    for line in iter (f.readline, ''):
        line = line.strip ()
        if line == '#':
            break
        header.append (line)
    return header


def parse_constraints (f, scripts):
    """Parse an open constraints file into per-script ``ConstraintSet``s.

    Args:
        f: A text file object positioned at the start of the file.
        scripts: Dict mapping code points to script names; the script of a
            sequence is looked up from its first code point.

    Returns:
        A tuple ``(header, constraints)``: the stripped header lines and a
        dict mapping script names to ``ConstraintSet`` objects.

    Raises:
        AssertionError: If a sequence has fewer than two code points or no
            sequence is found at all.
        KeyError: If the first code point of a sequence is not in ``scripts``.
    """
    header = read_constraints_header (f)
    constraints = {}
    for line in f:
        constraint = [int (cp, 16) for cp in strip_comment (line).split (';')[0].split ()]
        if not constraint:
            continue
        assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
    assert constraints, 'No constraints found'
    return header, constraints


def emit (program, constraints_header, scripts_header, constraints, script_order):
    """Print the generated C source to standard output.

    Args:
        program: Name of this script as shown in the generated comment.
        constraints_header: Header lines of the constraints file.
        scripts_header: Header lines of the scripts file.
        constraints: Dict mapping script names to ``ConstraintSet`` objects.
        script_order: Dict mapping script names to their sort key.
    """
    # NOTE(review): leading whitespace inside the literals below is kept
    # exactly as it appears in the reviewed copy, where runs of spaces look
    # collapsed; confirm against the checked-in generated file.
    print ('/* == Start of generated functions == */')
    print ('/*')
    print (' * The following functions are generated by running:')
    print (' *')
    print (' * %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % program)
    print (' *')
    print (' * on files with these headers:')
    print (' *')
    for line in constraints_header:
        print (' * %s' % line.strip ())
    print (' *')
    for line in scripts_header:
        print (' * %s' % line.strip ())
    print (' */')

    print ()
    print ('#include "hb.hh"')
    print ()
    print ('#ifndef HB_NO_OT_SHAPE')
    print ()
    print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
    print ()
    print ('static void')
    print ('_output_dotted_circle (hb_buffer_t *buffer)')
    print ('{')
    print (' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
    print (' _hb_glyph_info_reset_continuation (&dottedcircle);')
    print ('}')
    print ()
    print ('static void')
    print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
    print ('{')
    print (' _output_dotted_circle (buffer);')
    print (' buffer->next_glyph ();')
    print ('}')
    print ()

    print ('void')
    print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
    print ('\t\t\t\t hb_buffer_t *buffer,')
    print ('\t\t\t\t hb_font_t *font HB_UNUSED)')
    print ('{')
    print ('#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS')
    print (' return;')
    print ('#endif')
    print (' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
    print (' return;')
    print ()
    print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
    print (' * vowel-sequences that look like another vowel. Data for each script')
    print (' * collected from the USE script development spec.')
    print (' *')
    print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
    print (' */')
    print (' bool processed = false;')
    print (' buffer->clear_output ();')
    print (' unsigned int count = buffer->len;')
    print (' switch ((unsigned) buffer->props.script)')
    print (' {')

    # The loop variable gets its own name so it does not rebind the
    # ``constraints`` dict that is being iterated.
    for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
        print (' case HB_SCRIPT_{}:'.format (script.upper ()))
        print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
        print (' {')
        print ('\tbool matched = false;')
        write (str (constraint_set))
        print ('\tbuffer->next_glyph ();')
        print ('\tif (matched) _output_with_dotted_circle (buffer);')
        print (' }')
        print (' processed = true;')
        print (' break;')
        print ()

    print (' default:')
    print (' break;')
    print (' }')
    print (' if (processed)')
    print (' {')
    print (' if (buffer->idx < count)')
    print (' buffer->next_glyph ();')
    print (' buffer->swap_buffers ();')
    print (' }')
    print ('}')

    print ()
    print ()
    print ('#endif')
    print ('/* == End of generated functions == */')


def main ():
    """Read the two input files named on the command line and emit the C code."""
    if len (sys.argv) != 3:
        print (USAGE, file=sys.stderr)
        sys.exit (1)

    with io.open (sys.argv[2], encoding='utf-8') as f:
        scripts_header, scripts, script_order = parse_scripts (f)

    with io.open (sys.argv[1], encoding='utf-8') as f:
        constraints_header, constraints = parse_constraints (f, scripts)

    emit (sys.argv[0], constraints_header, scripts_header, constraints, script_order)


if __name__ == '__main__':
    main ()