#!/usr/bin/env python3

"""Generate the C lookup table of Indic syllabic/positional categories.

Reads IndicSyllabicCategory.txt, IndicPositionalCategory.txt and Blocks.txt
(paths given on the command line, in that order) and writes the generated
C source (`indic_table` plus `hb_indic_get_categories`) to standard output.
"""

# NOTE(review): the copy of this script that was reviewed had its whitespace
# collapsed, so runs of spaces inside the emitted C string literals (leading
# indentation, alignment) may have been squeezed to a single space.  The
# literals below are reproduced as seen; confirm them against the checked-in
# generated table before regenerating it.

import io
import sys

USAGE = """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt

Input files, as of Unicode 12:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt"""

# Code points that are kept regardless of which block they live in.
ALLOWED_SINGLES = [0x00A0, 0x25CC]
# Only code points from these blocks make it into the table.
ALLOWED_BLOCKS = [
    'Basic Latin',
    'Latin-1 Supplement',
    'Devanagari',
    'Bengali',
    'Gurmukhi',
    'Gujarati',
    'Oriya',
    'Tamil',
    'Telugu',
    'Kannada',
    'Malayalam',
    'Sinhala',
    'Myanmar',
    'Khmer',
    'Vedic Extensions',
    'General Punctuation',
    'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B',
    'Myanmar Extended-A',
]

# Per-file value used for code points a file does not mention
# (syllabic category, positional category, block).
defaults = ('Other', 'Not_Applicable', 'No_Block')

# Hand-picked short aliases for the syllabic and positional categories; the
# remaining aliases are derived from the capital letters of the value name
# and added to these dicts by main ().
short = [{
    "Bindu": 'Bi',
    "Cantillation_Mark": 'Ca',
    "Joiner": 'ZWJ',
    "Non_Joiner": 'ZWNJ',
    "Number": 'Nd',
    "Visarga": 'Vs',
    "Vowel": 'Vo',
    "Vowel_Dependent": 'M',
    "Consonant_Prefixed": 'CPrf',
    "Other": 'x',
},{
    "Not_Applicable": 'x',
}]

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]

# Running statistics maintained by print_block ().
total = 0
used = 0
last_block = None


def load_ucd_file (f):
    """Parse one UCD-style text file.

    `f` is an iterable text stream that also supports readline ().  The first
    two lines are taken verbatim as the file header.  Every following line has
    its `#` comment stripped and is split on `;`; lines without a `;` are
    skipped.  The first field is a hexadecimal code point or a `start..end`
    range, the second field is the value assigned to it.

    Returns a tuple (header, data, values): the two header lines, a dict
    mapping each code point to its value, and a dict mapping each value to
    the number of code points that were assigned it.
    """
    header = [f.readline () for _ in range (2)]
    data = {}
    values = {}
    for line in f:
        j = line.find ('#')
        if j >= 0:
            line = line[:j]

        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            continue

        uu = fields[0].split ('..')
        start = int (uu[0], 16)
        end = start if len (uu) == 1 else int (uu[1], 16)

        t = fields[1]
        for u in range (start, end + 1):
            data[u] = t
        values[t] = values.get (t, 0) + end - start + 1
    return header, data, values


def merge_data (data):
    """Merge the per-file dicts into one dict of [syllabic, positional, block].

    `data` is a list of three code-point dicts in command-line order.  A code
    point that appears only in the third (blocks) dict is ignored; fields a
    file does not supply keep the value from `defaults`.  The result is then
    restricted to ALLOWED_SINGLES and to code points whose block is listed in
    ALLOWED_BLOCKS.
    """
    combined = {}
    for i, d in enumerate (data):
        for u, v in d.items ():
            if u not in combined:
                if i == 2:
                    # Blocks.txt only annotates code points we already have.
                    continue
                combined[u] = list (defaults)
            combined[u][i] = v
    return {k: v for k, v in combined.items ()
            if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}


def print_block (block, start, end, data):
    """Print the table rows for code points start..end (inclusive).

    `start` and `end + 1` must be multiples of 8, since rows hold eight
    entries.  `block` is the block name used for the heading comment, or
    None for filler rows between two nearby ranges.  Code points missing
    from `data` are emitted with the default categories.  Updates the global
    `total`/`used` counters and remembers the last block name printed.
    """
    global total, used, last_block
    if block and block != last_block:
        print ()
        print ()
        print (" /* %s */" % block)
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 8 == 0:
            print ()
            print (" /* %04X */" % u, end="")
        if u in data:
            num += 1
        d = data.get (u, defaults)
        print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")

    total += end - start + 1
    used += num
    if block:
        last_block = block


def main ():
    """Read the three input files named in sys.argv and print the table.

    Exits with status 1 after printing the usage text to stderr when the
    argument count is wrong.  Raises Exception when two categories end up
    with the same short alias, or when the finished table is less than 30%
    occupied.
    """
    if len (sys.argv) != 4:
        print (USAGE, file=sys.stderr)
        sys.exit (1)

    headers = []
    data = []
    values = []
    for path in sys.argv[1:]:
        # Close each input as soon as it has been parsed.
        with io.open (path, encoding='utf-8') as f:
            header, table, counts = load_ucd_file (f)
        headers.append (header)
        data.append (table)
        values.append (counts)

    # Make sure the default value of every file is a known value.
    for i, v in enumerate (defaults):
        values[i][v] = values[i].get (v, 0) + 1

    # Merge data into one dict:
    data = merge_data (data)

    # Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
    singles = {u: data.pop (u) for u in ALLOWED_SINGLES}

    print ("/* == Start of generated table == */")
    print ("/*")
    print (" * The following table is generated by running:")
    print (" *")
    print (" * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
    print (" *")
    print (" * on files with these headers:")
    print (" *")
    for h in headers:
        for l in h:
            print (" * %s" % (l.strip()))
    print (" */")
    print ()
    print ('#include "hb.hh"')
    print ()
    print ('#ifndef HB_NO_OT_SHAPE')
    print ()
    print ('#include "hb-ot-shape-complex-indic.hh"')
    print ()

    # Shorten values.  Seed the reverse map with the hand-picked aliases so
    # that derived aliases which collide with them are detected.
    all_shorts = [{},{}]
    for i in range (2):
        for v, s in short[i].items ():
            all_shorts[i][s] = v

    print ('#pragma GCC diagnostic push')
    print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
    cat_defs = []
    for i in range (2):
        for v in sorted (values[i].keys ()):
            v_no_and = v.replace ('_And_', '_')
            if v in short[i]:
                s = short[i][v]
            else:
                # Derive the alias from the capital letters of the name.
                s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
                if s in all_shorts[i]:
                    raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
                all_shorts[i][s] = v
                short[i][v] = s
            cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + v.upper (), str (values[i][v]), v))

    maxlen_s = max (len (c[0]) for c in cat_defs)
    maxlen_l = max (len (c[1]) for c in cat_defs)
    maxlen_n = max (len (c[2]) for c in cat_defs)
    for s in what_short:
        print ()
        for c in cat_defs:
            if s in c[0]:
                print ("#define %s %s /* %s chars; %s */" %
                       (c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
    print ()
    print ('#pragma GCC diagnostic pop')
    print ()
    print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
    print ()
    print ()

    uu = sorted (data.keys ())

    last = -100000
    offset = 0
    starts = []
    ends = []
    print ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
    for u in uu:
        if u <= last:
            continue
        block = data[u][2]

        # Extend the run while consecutive code points stay in the same
        # block, then round it out to whole rows of eight.
        start = u//8*8
        end = start+1
        while end in data and block == data[end][2]:
            end += 1
        end = (end-1)//8*8 + 7

        if start != last + 1:
            if start - last <= 1+16*3:
                # Small gap: pad with default entries instead of starting a
                # new table range.
                print_block (None, last+1, start-1, data)
                last = start-1
            else:
                if last >= 0:
                    ends.append (last + 1)
                    offset += ends[-1] - starts[-1]
                print ()
                print ()
                print ("#define indic_offset_0x%04xu %d" % (start, offset))
                starts.append (start)

        print_block (block, start, end, data)
        last = end
    ends.append (last + 1)
    offset += ends[-1] - starts[-1]
    print ()
    print ()
    occupancy = used * 100. / total
    page_bits = 12
    print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    print ()
    print ("INDIC_TABLE_ELEMENT_TYPE")
    print ("hb_indic_get_categories (hb_codepoint_t u)")
    print ("{")
    print (" switch (u >> %d)" % page_bits)
    print (" {")
    pages = {u >> page_bits for u in starts + ends + list (singles.keys ())}
    for p in sorted (pages):
        print (" case 0x%0Xu:" % p)
        for u, d in singles.items ():
            if p != u>>page_bits: continue
            print (" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
        for (start, end) in zip (starts, ends):
            if p not in [start>>page_bits, end>>page_bits]: continue
            offset_name = "indic_offset_0x%04xu" % start
            print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset_name))
        print (" break;")
        print ("")
    print (" default:")
    print (" break;")
    print (" }")
    print (" return _(x,x);")
    print ("}")
    print ()
    print ("#undef _")
    for i in range (2):
        print ()
        for v in sorted (values[i].keys ()):
            print ("#undef %s_%s" %
                   (what_short[i], short[i][v]))
    print ()
    print ('#endif')
    print ()
    print ("/* == End of generated table == */")

    # Maintain at least 30% occupancy in the table
    if occupancy < 30:
        raise Exception ("Table too sparse, please investigate: ", occupancy)


if __name__ == "__main__":
    main ()