#!/usr/bin/env python3
# flake8: noqa
"""Generate the Universal Shaping Engine (USE) category table.

Reads IndicSyllabicCategory.txt, IndicPositionalCategory.txt,
UnicodeData.txt and Blocks.txt (in that order, from the command line),
classifies every code point of interest into a USE category and prints a
C table plus a lookup function on standard output.
"""

import io
import sys

if len(sys.argv) != 5:
    print("""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt

Input file, as of Unicode 12:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt""", file=sys.stderr)
    sys.exit(1)

# Code points whose block is listed here are dropped from the table.
BLACKLISTED_BLOCKS = ["Thai", "Lao"]

# Open the four inputs, and make sure every handle that was opened is
# closed again even if parsing fails part-way through.
files = []
try:
    for path in sys.argv[1:]:
        files.append(io.open(path, encoding='utf-8'))

    # The first two lines of every file except UnicodeData.txt (index 2)
    # are echoed into the generated header comment.
    headers = [[f.readline() for i in range(2)] for j, f in enumerate(files) if j != 2]
    headers.append(["UnicodeData.txt does not have a header."])

    # data[i] maps code point -> property value read from file i;
    # values[i] counts how many code points carry each property value.
    data = [{} for f in files]
    values = [{} for f in files]
    for i, f in enumerate(files):
        for line in f:

            # Strip trailing comments.
            j = line.find('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                continue

            # First field is either a single code point or a "start..end" range.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            # UnicodeData.txt (index 2) keeps the value of interest in its
            # third field; the other files keep it in their second field.
            t = fields[1 if i != 2 else 2]

            for u in range(start, end + 1):
                data[i][u] = t
            values[i][t] = values[i].get(t, 0) + end - start + 1
finally:
    for f in files:
        f.close()

# Per-file default property value, in command-line order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in (0x034F, 0x1B61, 0x1B63, 0x1B64, 0x1B65, 0x1B66, 0x1B67,
          0x1B69, 0x1B6A, 0x2060):
    data[0][u] = defaults[0]
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
for u in (0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68):
    data[0][u] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
data[0][0x11C44] = 'Consonant_Placeholder'
data[0][0x11C45] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
data[0][0x111C8] = 'Consonant_Placeholder'
for u in range(0xFE00, 0xFE0F + 1):
    data[0][u] = defaults[0]

# Merge data into one dict:
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1
combined = {}
for i, d in enumerate(data):
    for u, v in d.items():
        # Files at index 2 and 3 only annotate code points that one of the
        # first two files already introduced; they never add new ones.
        if i >= 2 and u not in combined:
            continue
        if u not in combined:
            combined[u] = list(defaults)
        combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
data = combined
del combined
num = len(data)


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Consonant_Initial_Postfixed',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
]


class PropertyValue(object):
    """A named property value that compares equal to its own name.

    Instances compare (and hash) like the plain string ``name``, so the
    raw strings parsed from the data files can be tested directly against
    the module-level constants created from ``property_names``.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        try:
            return self.name == other.name
        except AttributeError:
            # Unrelated type: let Python fall back to its default
            # comparison instead of raising from inside ``==``.
            return NotImplemented

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(str(self))


property_values = {}

for name in property_names:
    value = PropertyValue(name)
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
# Expose every property value as a module-level constant (Lo, Virama, Top, ...).
globals().update(property_values)


# Each predicate below takes the code point (U), its Indic_Syllabic_Category
# (UISC) and its General_Category (UGC), and reports whether the code point
# belongs to the USE category it is registered under in ``use_mapping``.

def is_BASE(U, UISC, UGC):
    """Return True if the code point maps to USE category 'B'."""
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     #SPEC-DRAFT Consonant_Placeholder,
                     Tone_Letter,
                     Vowel_Independent #SPEC-DRAFT
                     ] or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))


def is_BASE_IND(U, UISC, UGC):
    """Return True if the code point maps to USE category 'IND'."""
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    # SPEC-DRAFT-OUTDATED! U == 0x002D
    return (UISC in [Consonant_Dead, Modifying_Letter] or
            (UGC == Po and U not in [0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022,
                                     0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]))


def is_BASE_NUM(U, UISC, UGC):
    """Return True if the code point maps to USE category 'N'."""
    return UISC == Brahmi_Joining_Number


def is_BASE_OTHER(U, UISC, UGC):
    """Return True if the code point maps to USE category 'GB'."""
    if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]


def is_CGJ(U, UISC, UGC):
    """Return True if the code point maps to USE category 'CGJ' (U+034F)."""
    return U == 0x034F


def is_CONS_FINAL(U, UISC, UGC):
    """Return True if the code point maps to USE category 'F'."""
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)


def is_CONS_FINAL_MOD(U, UISC, UGC):
    """Return True if the code point maps to USE category 'FM'."""
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier


def is_CONS_MED(U, UISC, UGC):
    """Return True if the code point maps to USE category 'M'."""
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return ((UISC == Consonant_Medial and UGC != Lo) or
            UISC == Consonant_Initial_Postfixed)


def is_CONS_MOD(U, UISC, UGC):
    """Return True if the code point maps to USE category 'CM'."""
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]


def is_CONS_SUB(U, UISC, UGC):
    """Return True if the code point maps to USE category 'SUB'."""
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    return UISC == Consonant_Subjoined and UGC != Lo


def is_CONS_WITH_STACKER(U, UISC, UGC):
    """Return True if the code point maps to USE category 'CS'."""
    return UISC == Consonant_With_Stacker


def is_HALANT(U, UISC, UGC):
    """Return True if the code point maps to USE category 'H'.

    Code points claimed by is_HALANT_OR_VOWEL_MODIFIER or is_SAKOT are
    excluded so that every code point matches exactly one category.
    """
    return (UISC in [Virama, Invisible_Stacker]
            and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
            and not is_SAKOT(U, UISC, UGC))


def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
    """Return True if the code point maps to USE category 'HVM'."""
    # https://github.com/harfbuzz/harfbuzz/issues/1102
    # https://github.com/harfbuzz/harfbuzz/issues/1379
    return U in [0x11046, 0x1134D]


def is_HALANT_NUM(U, UISC, UGC):
    """Return True if the code point maps to USE category 'HN'."""
    return UISC == Number_Joiner


def is_ZWNJ(U, UISC, UGC):
    """Return True if the code point maps to USE category 'ZWNJ'."""
    return UISC == Non_Joiner


def is_ZWJ(U, UISC, UGC):
    """Return True if the code point maps to USE category 'ZWJ'."""
    return UISC == Joiner


def is_Word_Joiner(U, UISC, UGC):
    """Return True if the code point maps to USE category 'WJ' (U+2060)."""
    return U == 0x2060


def is_OTHER(U, UISC, UGC):
    """Return True if the code point maps to USE category 'O'.

    This is the residue of UISC == Other after the more specific
    categories that also draw from it have been excluded.
    """
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    return (UISC == Other
            and not is_SYM(U, UISC, UGC)
            and not is_SYM_MOD(U, UISC, UGC)
            and not is_CGJ(U, UISC, UGC)
            and not is_Word_Joiner(U, UISC, UGC)
            and not is_VARIATION_SELECTOR(U, UISC, UGC)
            )


def is_Reserved(U, UISC, UGC):
    """Return True if the code point maps to USE category 'Rsv'."""
    return UGC == Cn


def is_REPHA(U, UISC, UGC):
    """Return True if the code point maps to USE category 'R'."""
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]


def is_SAKOT(U, UISC, UGC):
    """Return True if the code point maps to USE category 'Sk' (U+1A60)."""
    return U == 0x1A60


def is_SYM(U, UISC, UGC):
    """Return True if the code point maps to USE category 'S'."""
    if U == 0x25CC: return False #SPEC-DRAFT
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    return UGC in [So, Sc] and U not in [0x1B62, 0x1B68]


def is_SYM_MOD(U, UISC, UGC):
    """Return True if the code point maps to USE category 'SM'."""
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]


def is_VARIATION_SELECTOR(U, UISC, UGC):
    """Return True if the code point maps to USE category 'VS'."""
    return 0xFE00 <= U <= 0xFE0F


def is_VOWEL(U, UISC, UGC):
    """Return True if the code point maps to USE category 'V'."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC == Pure_Killer or
            (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))


def is_VOWEL_MOD(U, UISC, UGC):
    """Return True if the code point maps to USE category 'VM'."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            (UGC != Lo and (UISC == Bindu or U in [0xAA29])))


# USE category name -> predicate deciding membership.  map_to_use() asserts
# that exactly one predicate accepts each code point.
use_mapping = {
    'B': is_BASE,
    'IND': is_BASE_IND,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ': is_ZWJ,
    'WJ': is_Word_Joiner,
    'O': is_OTHER,
    'Rsv': is_Reserved,
    'R': is_REPHA,
    'S': is_SYM,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'VS': is_VARIATION_SELECTOR,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}

# USE category -> {position suffix -> accepted Indic_Positional_Category
# values}.  A value of None means the category takes no position suffix.
# Insertion order of the inner dicts determines the order of the emitted
# #define / #undef lines.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
        'Pre': [Left],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HVM': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'SUB': None,
}


def map_to_use(data):
    """Map every code point in *data* to its USE category and block.

    *data* maps a code point to a 4-item sequence
    ``(UISC, UIPC, UGC, UBlock)``.  The result maps each code point to
    ``(USE, UBlock)``, where ``USE`` is a key of ``use_mapping`` with a
    position suffix from ``use_positions`` appended when the category
    has one.

    Raises AssertionError when a code point matches zero or several
    categories, or when its positional category cannot be resolved to
    exactly one suffix.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
        if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
        # Overrides to allow NFC order matching syllable
        # https://github.com/harfbuzz/harfbuzz/issues/1012
        if UBlock == 'Tibetan' and is_VOWEL(U, UISC, UGC):
            if UIPC == Top:
                UIPC = Bottom

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
        # also  https://github.com/harfbuzz/harfbuzz/issues/1012
        if UBlock == 'Chakma' and is_VOWEL(U, UISC, UGC):
            if UIPC == Top:
                UIPC = Bottom
            elif UIPC == Bottom:
                UIPC = Top

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
        if 0x1BF2 <= U <= 0x1BF3:
            UISC = Nukta
            UIPC = Bottom

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED: UISC = Tone_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
        if U == 0x11134: UISC = Gemination_Mark

        # Exactly one predicate must claim the code point.
        matches = [k for k, v in items if v(U, UISC, UGC)]
        assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: These should die, but have UIPC in Unicode 12.0
        if U in [0x953, 0x954]: UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 12.0
        if U == 0x103C: UIPC = Left

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/2012
        if U == 0x1C29: UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
        if 0xA926 <= U <= 0xA92A: UIPC = Top
        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        #  and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
        if U == 0x1171E: UIPC = Left
        if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        # Append the position suffix for categories that have one.
        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            suffixes = [k for k, v in pos_mapping.items() if v and UIPC in v]
            assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out


# From here on ``data`` maps code point -> (USE category, block name) and
# ``defaults`` is the entry used for code points missing from it.
defaults = ('O', 'No_Block')
data = map_to_use(data)

print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print()
print('#include "hb.hh"')
print()
print('#ifndef HB_NO_OT_SHAPE')
print()
print('#include "hb-ot-shape-complex-use.hh"')
print()

# Running statistics maintained by print_block().
total = 0        # number of table slots emitted
used = 0         # number of those slots backed by a real entry in ``data``
last_block = None


def print_block(block, start, end, data):
    """Print the table entries for code points ``start``..``end`` inclusive.

    *block* is the block name to announce in a comment, or None when the
    range only fills a gap between two blocks.  ``start`` and ``end + 1``
    must both be multiples of 8.  Code points missing from *data* are
    printed with the module-level ``defaults`` entry.  Updates the global
    ``total``, ``used`` and ``last_block`` counters.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)
        # A block that starts mid-row is padded so its first entry lands
        # in the right column.
        if start % 16:
            print(' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range(start, end+1):
        # Start a new output row, labelled with its first code point,
        # every 16 entries.
        if u % 16 == 0:
            print()
            print(" /* %04X */" % u, end='')
        if u in data:
            num += 1
        d = data.get(u, defaults)
        print("%6s," % d[0], end='')

    total += end - start + 1
    used += num
    if block:
        last_block = block


uu = sorted(data.keys())

last = -100000   # last code point already emitted
offset = 0       # index into use_table of the current run's first entry
starts = []      # first code point of every contiguous run in the table
ends = []        # one past the last code point of every contiguous run
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')
# Short aliases for categories without position suffixes...
for k, v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]: continue
    print("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:]))
# ...and for every category+suffix combination.
for k, v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print("#define %s USE_%s" % (tag, tag))
print('#pragma GCC diagnostic pop')
print("")
print("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
for u in uu:
    if u <= last:
        continue
    block = data[u][1]

    # Extend the range over consecutive code points of the same block,
    # then round it out to whole groups of 8.  ``data`` has exactly the
    # keys listed in ``uu``, so the dict gives an O(1) membership test.
    start = u//8*8
    end = start+1
    while end in data and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: fill it with default entries rather than
            # starting a new run.
            print_block(None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the previous run (if any) and open a new one.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define use_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print()
print("USE_TABLE_ELEMENT_TYPE")
print("hb_use_get_category (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
# One switch case per page that any run starts or ends in.
pages = {u >> page_bits for u in starts + ends}
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for (start, end) in zip(starts, ends):
        if p not in [start >> page_bits, end >> page_bits]: continue
        offset_name = "use_offset_0x%04xu" % start
        print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset_name))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return USE_O;")
print("}")
print()
for k in sorted(use_mapping.keys()):
    if k in use_positions and use_positions[k]: continue
    print("#undef %s" % k)
for k, v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print("#undef %s" % tag)
print()
print()
print('#endif')
print("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception("Table too sparse, please investigate: ", occupancy)