#!/usr/bin/env python3

"""Generator of the function to prohibit certain vowel sequences.

It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.
This function should be used as the ``preprocess_text`` of an
``hb_ot_complex_shaper_t``.

usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt

Input file:
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
"""

import collections
import sys


def write (s):
    """Write the string ``s`` to standard output as UTF-8 bytes.

    The text layer of ``sys.stdout`` is flushed first so that anything
    already emitted with ``print`` stays ahead of the raw bytes written
    here.
    """
    sys.stdout.flush ()
    sys.stdout.buffer.write (s.encode ('utf-8'))


def _strip_comment (line):
    """Return ``line`` without its ``#`` comment, if it has one."""
    return line.partition ('#')[0]


def parse_scripts (path):
    """Parse a ``Scripts.txt``-style file.

    Args:
        path (str): Path of the file to read (UTF-8).

    Returns:
        A ``(header, scripts, script_order)`` tuple.  ``header`` is the
        list of the first two raw lines of the file.  ``scripts`` maps
        each listed code point to its script name.  ``script_order`` maps
        each script name to the first code point of the first range that
        mentions it, which is later used as a sort key.
    """
    scripts = {}
    script_order = {}
    with open (path, encoding='utf-8') as f:
        header = [f.readline () for _ in range (2)]
        for line in f:
            fields = [x.strip () for x in _strip_comment (line).split (';')]
            if len (fields) == 1:
                # No ';' on this line: blank or comment-only.
                continue
            uu = fields[0].split ('..')
            start = int (uu[0], 16)
            # A single code point is a range of length one.
            end = start if len (uu) == 1 else int (uu[1], 16)
            script = fields[1]
            for u in range (start, end + 1):
                scripts[u] = script
            if script not in script_order:
                script_order[script] = start
    return header, scripts, script_order


class ConstraintSet (object):
    """A set of prohibited code point sequences.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        While a single sequence is stored: a new sequence that is a
        prefix of the stored one replaces it, a new sequence that has the
        stored one as a prefix is ignored, and any other sequence turns
        the storage into a dictionary keyed by first code point.

        Args:
            constraint (List[int]): The sequence to add.  An empty
                sequence is ignored.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        # Deliberately not ``elif``: a list converted just above must
        # still receive the new constraint.
        if isinstance (self._c, dict):
            if first in self._c:
                # NOTE(review): if ``rest`` is empty this call returns
                # immediately, so a sequence ending exactly at an existing
                # dictionary key is not recorded -- confirm the input data
                # never relies on that.
                self._c[first].add (rest)
            else:
                self._c[first] = ConstraintSet (rest)

    @staticmethod
    def _indent (depth):
        """Return the leading whitespace for nesting level ``depth``."""
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the C code that tests the buffer against this set.

        Args:
            index (int): Offset from ``buffer->idx`` of the code point
                tested first by this (sub)set.
            depth (int): Nesting level used to indent the emitted code.

        Raises:
            AssertionError: A sequence of length 0 or 1 is reached at an
                offset for which code generation is not implemented.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
            else:
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
                if index:
                    s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
                for i, cp in enumerate (self._c[1:], start=1):
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
                s.append ('{}{{\n'.format (indent))
                for i in range (index):
                    s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
                s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group the code points whose generated bodies are identical
            # so that they share one run of ``case`` labels.
            cases = collections.defaultdict (set)
            for first, rest in sorted (self._c.items ()):
                cases[rest.__str__ (index + 1, depth + 2)].add (first)
            # Order the groups by their smallest label; four labels per line.
            for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)


def parse_constraints (path, scripts):
    """Parse the file listing the prohibited sequences.

    The file starts with header lines terminated by a line consisting of
    a lone ``#``.  Each following non-empty line holds, before the first
    ``;`` and with ``#`` comments removed, a whitespace-separated list of
    hexadecimal code points forming one prohibited sequence.

    Args:
        path (str): Path of the file to read (UTF-8).
        scripts (Dict[int, str]): Code point to script name mapping, used
            to file each sequence under the script of its first code point.

    Returns:
        A ``(header, constraints)`` tuple: the stripped header lines and a
        dictionary mapping script names to ``ConstraintSet`` objects.

    Raises:
        AssertionError: A sequence has fewer than two code points, or the
            file contains no sequence at all.
        KeyError: The first code point of a sequence is not in ``scripts``.
    """
    constraints = {}
    with open (path, encoding='utf-8') as f:
        header = []
        # ``readline`` returns '' at end of file; stopping there keeps a
        # file without the lone '#' separator from looping forever.
        for raw in iter (f.readline, ''):
            line = raw.strip ()
            if line == '#':
                break
            header.append (line)
        for line in f:
            constraint = [int (cp, 16) for cp in _strip_comment (line).split (';')[0].split ()]
            if not constraint:
                continue
            if len (constraint) < 2:
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError ('Prohibited sequence is too short: {}'.format (constraint))
            script = scripts[constraint[0]]
            if script in constraints:
                constraints[script].add (constraint)
            else:
                constraints[script] = ConstraintSet (constraint)
    if not constraints:
        raise AssertionError ('No constraints found')
    return header, constraints


def emit (program, constraints_header, scripts_header, constraints, script_order):
    """Print the generated C source to standard output.

    Args:
        program (str): Name of this program, quoted in the banner comment.
        constraints_header (List[str]): Header lines of the constraints file.
        scripts_header (List[str]): Header lines of the scripts file.
        constraints (Dict[str, ConstraintSet]): Prohibited sequences by script.
        script_order (Dict[str, int]): Sort key for each script name.
    """
    print ('/* == Start of generated functions == */')
    print ('/*')
    print (' * The following functions are generated by running:')
    print (' *')
    print (' * %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % program)
    print (' *')
    print (' * on files with these headers:')
    print (' *')
    for line in constraints_header:
        print (' * %s' % line.strip ())
    print (' *')
    for line in scripts_header:
        print (' * %s' % line.strip ())
    print (' */')

    print ()
    print ('#include "hb.hh"')
    print ()
    print ('#ifndef HB_NO_OT_SHAPE')
    print ()
    print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
    print ()
    print ('static void')
    print ('_output_dotted_circle (hb_buffer_t *buffer)')
    print ('{')
    print (' (void) buffer->output_glyph (0x25CCu);')
    print (' _hb_glyph_info_reset_continuation (&buffer->prev());')
    print ('}')
    print ()
    print ('static void')
    print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
    print ('{')
    print (' _output_dotted_circle (buffer);')
    print (' (void) buffer->next_glyph ();')
    print ('}')
    print ()

    print ('void')
    print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
    print ('\t\t\t\t hb_buffer_t *buffer,')
    print ('\t\t\t\t hb_font_t *font HB_UNUSED)')
    print ('{')
    print ('#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS')
    print (' return;')
    print ('#endif')
    print (' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
    print (' return;')
    print ()
    print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
    print (' * vowel-sequences that look like another vowel. Data for each script')
    print (' * collected from the USE script development spec.')
    print (' *')
    print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
    print (' */')
    print (' buffer->clear_output ();')
    print (' unsigned int count = buffer->len;')
    print (' switch ((unsigned) buffer->props.script)')
    print (' {')

    # One ``case`` per script, in the order given by ``script_order``.
    for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
        print (' case HB_SCRIPT_{}:'.format (script.upper ()))
        print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
        print (' {')
        print ('\tbool matched = false;')
        write (str (constraint_set))
        print ('\t(void) buffer->next_glyph ();')
        print ('\tif (matched) _output_with_dotted_circle (buffer);')
        print (' }')
        print (' break;')
        print ()

    print (' default:')
    print (' break;')
    print (' }')
    print (' buffer->swap_buffers ();')
    print ('}')

    print ()
    print ()
    print ('#endif')
    print ('/* == End of generated functions == */')


def main ():
    """Read the two input files named on the command line and emit the code.

    Exits with the module docstring as the message unless exactly two
    arguments are given.
    """
    if len (sys.argv) != 3:
        sys.exit (__doc__)
    # The scripts file (second argument) is read first because the
    # constraints are filed by script.
    scripts_header, scripts, script_order = parse_scripts (sys.argv[2])
    constraints_header, constraints = parse_constraints (sys.argv[1], scripts)
    emit (sys.argv[0], constraints_header, scripts_header, constraints, script_order)


if __name__ == '__main__':
    main ()