#!/usr/bin/env python
# flake8: noqa

"""Generate the USE category lookup table as C source on standard output.

Usage:

    ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt

The four inputs are parsed into per-code-point properties, merged, mapped to
USE categories by the ``is_*`` predicates below, and printed as a C table
(``use_table``) together with the ``hb_use_get_category`` lookup function.
"""

from __future__ import print_function, division, absolute_import

import io
import sys

# Code points whose block is listed here are dropped from the table.
BLACKLISTED_BLOCKS = ["Thai", "Lao"]

# Fallback value for each input file, in command-line order:
# (Indic_Syllabic_Category, Indic_Positional_Category, General_Category, Block).
PROPERTY_DEFAULTS = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# Entry printed for table slots that carry no data: (USE category, block).
USE_DEFAULTS = ('O', 'No_Block')


def parse_line(line, value_field):
    """Parse one line of a semicolon-separated Unicode data file.

    Everything from the first ``#`` on is ignored.  Returns a tuple
    ``(start, end, value)`` where ``start``/``end`` are the inclusive code
    point bounds taken from the first field (``XXXX`` or ``XXXX..YYYY``, in
    hexadecimal) and ``value`` is the stripped field at index ``value_field``.
    Returns ``None`` for lines that contain no ``;`` separator.
    """
    comment = line.find('#')
    if comment >= 0:
        line = line[:comment]

    fields = [x.strip() for x in line.split(';')]
    if len(fields) == 1:
        return None

    bounds = fields[0].split('..')
    start = int(bounds[0], 16)
    end = start if len(bounds) == 1 else int(bounds[1], 16)
    return start, end, fields[value_field]


def read_property_files(paths):
    """Read the four input files given on the command line.

    Returns ``(headers, data)``.  ``headers`` holds the first two lines of
    every file except the third one (UnicodeData.txt), followed by a
    placeholder entry for that file.  ``data`` is a list with one dict per
    file mapping code point to property value.
    """
    headers = []
    data = []
    for i, path in enumerate(paths):
        is_unicode_data = i == 2
        # UnicodeData.txt keeps the value of interest in its third field.
        value_field = 2 if is_unicode_data else 1
        table = {}
        # The context manager closes each file as soon as it has been read.
        with io.open(path, encoding='utf-8') as f:
            if not is_unicode_data:
                headers.append([f.readline() for _ in range(2)])
            for line in f:
                parsed = parse_line(line, value_field)
                if parsed is None:
                    continue
                start, end, value = parsed
                for u in range(start, end + 1):
                    table[u] = value
        data.append(table)
    headers.append(["UnicodeData.txt does not have a header."])
    return headers, data


def apply_syllabic_category_overrides(isc):
    """Patch the Indic_Syllabic_Category dict ``isc`` in place."""
    # TODO Characters that are not in Unicode Indic files, but used in USE
    for u in (0x034F, 0x1B61, 0x1B63, 0x1B64, 0x1B65, 0x1B66, 0x1B67,
              0x1B69, 0x1B6A, 0x2060):
        isc[u] = PROPERTY_DEFAULTS[0]
    # TODO https://github.com/harfbuzz/harfbuzz/pull/1685
    for u in (0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68):
        isc[u] = 'Consonant_Placeholder'
    # TODO https://github.com/harfbuzz/harfbuzz/issues/1035
    for u in (0x11C44, 0x11C45):
        isc[u] = 'Consonant_Placeholder'
    # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
    isc[0x111C8] = 'Consonant_Placeholder'
    for u in range(0xFE00, 0xFE0F + 1):
        isc[u] = PROPERTY_DEFAULTS[0]


def merge_properties(data):
    """Merge the per-file dicts into one dict of 4-element lists.

    Only code points present in one of the first two dicts get an entry; the
    remaining dicts merely fill in their column for those code points.
    Missing columns keep the value from ``PROPERTY_DEFAULTS``.  Entries whose
    block (column 3) is in ``BLACKLISTED_BLOCKS`` are removed.
    """
    combined = {}
    for i, d in enumerate(data):
        for u, v in d.items():
            if u not in combined:
                if i >= 2:
                    continue
                combined[u] = list(PROPERTY_DEFAULTS)
            combined[u][i] = v
    return {k: v for k, v in combined.items()
            if v[3] not in BLACKLISTED_BLOCKS}


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Consonant_Initial_Postfixed',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
]

try:
    basestring
except NameError:
    basestring = str


class PropertyValue(object):
    """Named property value that compares equal to its own name.

    Equality and hashing are based on ``name`` only, so an instance is
    interchangeable with the plain string read from the data files.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        return self.name == (other if isinstance(other, basestring) else other.name)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(str(self))


property_values = {}

for name in property_names:
    value = PropertyValue(name)
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
# Expose every property value as a module-level name (Lo, Top, Virama, ...)
# so the predicates and tables below can reference them directly.
globals().update(property_values)


# Category predicates.  Each takes the code point U, its
# Indic_Syllabic_Category UISC and its General_Category UGC, and tells
# whether the code point belongs to the USE category it is registered under
# in use_mapping.

def is_BASE(U, UISC, UGC):
    """Predicate for USE category B."""
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     #SPEC-DRAFT Consonant_Placeholder,
                     Tone_Letter,
                     Vowel_Independent #SPEC-DRAFT
                     ] or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))


def is_BASE_IND(U, UISC, UGC):
    """Predicate for USE category IND."""
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    return (UISC in [Consonant_Dead, Modifying_Letter] or
            (UGC == Po and not U in [0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or
            False # SPEC-DRAFT-OUTDATED! U == 0x002D
            )


def is_BASE_NUM(U, UISC, UGC):
    """Predicate for USE category N."""
    return UISC == Brahmi_Joining_Number


def is_BASE_OTHER(U, UISC, UGC):
    """Predicate for USE category GB."""
    if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]


def is_CGJ(U, UISC, UGC):
    """Predicate for USE category CGJ."""
    return U == 0x034F


def is_CONS_FINAL(U, UISC, UGC):
    """Predicate for USE category F."""
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)


def is_CONS_FINAL_MOD(U, UISC, UGC):
    """Predicate for USE category FM."""
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier


def is_CONS_MED(U, UISC, UGC):
    """Predicate for USE category M."""
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return (UISC == Consonant_Medial and UGC != Lo or
            UISC == Consonant_Initial_Postfixed)


def is_CONS_MOD(U, UISC, UGC):
    """Predicate for USE category CM."""
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]


def is_CONS_SUB(U, UISC, UGC):
    """Predicate for USE category SUB."""
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    return UISC == Consonant_Subjoined and UGC != Lo


def is_CONS_WITH_STACKER(U, UISC, UGC):
    """Predicate for USE category CS."""
    return UISC == Consonant_With_Stacker


def is_HALANT(U, UISC, UGC):
    """Predicate for USE category H."""
    return (UISC in [Virama, Invisible_Stacker]
            and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
            and not is_SAKOT(U, UISC, UGC))


def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
    """Predicate for USE category HVM."""
    # https://github.com/harfbuzz/harfbuzz/issues/1102
    # https://github.com/harfbuzz/harfbuzz/issues/1379
    return U in [0x11046, 0x1134D]


def is_HALANT_NUM(U, UISC, UGC):
    """Predicate for USE category HN."""
    return UISC == Number_Joiner


def is_ZWNJ(U, UISC, UGC):
    """Predicate for USE category ZWNJ."""
    return UISC == Non_Joiner


def is_ZWJ(U, UISC, UGC):
    """Predicate for USE category ZWJ."""
    return UISC == Joiner


def is_Word_Joiner(U, UISC, UGC):
    """Predicate for USE category WJ."""
    return U == 0x2060


def is_OTHER(U, UISC, UGC):
    """Predicate for USE category O."""
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    return (UISC == Other
            and not is_SYM(U, UISC, UGC)
            and not is_SYM_MOD(U, UISC, UGC)
            and not is_CGJ(U, UISC, UGC)
            and not is_Word_Joiner(U, UISC, UGC)
            and not is_VARIATION_SELECTOR(U, UISC, UGC)
            )


def is_Reserved(U, UISC, UGC):
    """Predicate for USE category Rsv."""
    return UGC == Cn


def is_REPHA(U, UISC, UGC):
    """Predicate for USE category R."""
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]


def is_SAKOT(U, UISC, UGC):
    """Predicate for USE category Sk."""
    return U == 0x1A60


def is_SYM(U, UISC, UGC):
    """Predicate for USE category S."""
    if U == 0x25CC: return False #SPEC-DRAFT
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    return UGC in [So, Sc] and U not in [0x1B62, 0x1B68]


def is_SYM_MOD(U, UISC, UGC):
    """Predicate for USE category SM."""
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]


def is_VARIATION_SELECTOR(U, UISC, UGC):
    """Predicate for USE category VS."""
    return 0xFE00 <= U <= 0xFE0F


def is_VOWEL(U, UISC, UGC):
    """Predicate for USE category V."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC == Pure_Killer or
            (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))


def is_VOWEL_MOD(U, UISC, UGC):
    """Predicate for USE category VM."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            (UGC != Lo and (UISC == Bindu or U in [0xAA29])))


# USE category name -> predicate.  map_to_use() asserts that exactly one
# predicate accepts every code point.
use_mapping = {
    'B': is_BASE,
    'IND': is_BASE_IND,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ': is_ZWJ,
    'WJ': is_Word_Joiner,
    'O': is_OTHER,
    'Rsv': is_Reserved,
    'R': is_REPHA,
    'S': is_SYM,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'VS': is_VARIATION_SELECTOR,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}

# USE category -> {suffix: [Indic_Positional_Category values]}.  A category
# mapped to None may carry a positional category but gets no suffix.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
        'Pre': [Left],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HVM': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'SUB': None,
}


def map_to_use(data):
    """Map merged Unicode properties to USE categories.

    ``data`` maps a code point to ``(UISC, UIPC, UGC, UBlock)``.  Returns a
    dict mapping each code point to ``(USE, UBlock)``, where ``USE`` is the
    category name plus a positional suffix when ``use_positions`` defines
    one.  Asserts that the category and the suffix are both unambiguous.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
        if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
        # Overrides to allow NFC order matching syllable
        # https://github.com/harfbuzz/harfbuzz/issues/1012
        if UBlock == 'Tibetan' and is_VOWEL(U, UISC, UGC):
            if UIPC == Top:
                UIPC = Bottom

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
        # also  https://github.com/harfbuzz/harfbuzz/issues/1012
        if UBlock == 'Chakma' and is_VOWEL(U, UISC, UGC):
            if UIPC == Top:
                UIPC = Bottom
            elif UIPC == Bottom:
                UIPC = Top

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
        if 0x1BF2 <= U <= 0x1BF3:
            UISC = Nukta
            UIPC = Bottom

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED: UISC = Tone_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
        if U == 0x11134: UISC = Gemination_Mark

        categories = [k for k, v in items if v(U, UISC, UGC)]
        assert len(categories) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, categories)
        USE = categories[0]

        # Resolve Indic_Positional_Category

        # TODO: These should die, but have UIPC in Unicode 12.0
        if U in [0x953, 0x954]: UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 12.0
        if U == 0x103C: UIPC = Left

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/2012
        if U == 0x1C29: UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
        if 0xA926 <= U <= 0xA92A: UIPC = Top
        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and   https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
        if U == 0x1171E: UIPC = Left
        if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            suffixes = [k for k, v in pos_mapping.items() if v and UIPC in v]
            assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out


# NOTE(review): the string literals printed below are reproduced exactly as
# found.  If the generated C is expected to be indented/aligned in a specific
# way, confirm the whitespace inside these literals against a known-good
# generated table.

# Running totals maintained by print_block(): table slots emitted, slots
# that carry real data, and the block whose heading was printed last.
total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
    """Print the table entries for the inclusive range ``start``..``end``.

    ``block`` is the block name to print as a heading when it differs from
    the previous one, or ``None`` for filler between blocks.  Both bounds
    must be aligned so that the range consists of whole groups of 8.
    Slots missing from ``data`` are printed with ``USE_DEFAULTS``.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)
        if start % 16:
            print(' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            print()
            print(" /* %04X */" % u, end='')
        if u in data:
            num += 1
        d = data.get(u, USE_DEFAULTS)
        print("%6s," % d[0], end='')

    total += end - start + 1
    used += num
    if block:
        last_block = block


def print_prologue(headers):
    """Print the leading comment (with the input file headers) and includes."""
    print("/* == Start of generated table == */")
    print("/*")
    print(" * The following table is generated by running:")
    print(" *")
    print(" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
    print(" *")
    print(" * on files with these headers:")
    print(" *")
    for h in headers:
        for l in h:
            print(" * %s" % (l.strip()))
    print(" */")
    print()
    print('#include "hb.hh"')
    print()
    print('#ifndef HB_NO_OT_SHAPE')
    print()
    print('#include "hb-ot-shape-complex-use.hh"')
    print()


def print_macro_definitions():
    """Print the short ``#define`` aliases used inside the table."""
    print('#pragma GCC diagnostic push')
    print('#pragma GCC diagnostic ignored "-Wunused-macros"')
    for k, v in sorted(use_mapping.items()):
        # Categories with positional variants are defined per suffix below.
        if k in use_positions and use_positions[k]: continue
        print("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:]))
    for k, v in sorted(use_positions.items()):
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            print("#define %s USE_%s" % (tag, tag))
    print('#pragma GCC diagnostic pop')
    print("")


def print_table(data):
    """Print the ``use_table`` array body.

    Returns ``(offset, starts, ends)``: the total number of table items and
    the parallel lists of first and one-past-last code point of every
    contiguous run stored in the table.
    """
    last = -100000
    offset = 0
    starts = []
    ends = []
    print("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
    for u in sorted(data):
        if u <= last:
            continue
        block = data[u][1]

        # Extend over the following code points of the same block, then
        # round the range out to a multiple of 8.  Membership is tested on
        # the dict (same keys as the sorted list) to keep this linear.
        start = u // 8 * 8
        end = start + 1
        while end in data and block == data[end][1]:
            end += 1
        end = (end - 1) // 8 * 8 + 7

        if start != last + 1:
            if start - last <= 1 + 16 * 3:
                # Small gap: pad with filler instead of starting a new run.
                print_block(None, last + 1, start - 1, data)
                last = start - 1
            else:
                if last >= 0:
                    ends.append(last + 1)
                    offset += ends[-1] - starts[-1]
                print()
                print()
                print("#define use_offset_0x%04xu %d" % (start, offset))
                starts.append(start)

        print_block(block, start, end, data)
        last = end
    ends.append(last + 1)
    offset += ends[-1] - starts[-1]
    print()
    print()
    return offset, starts, ends


def print_lookup_function(starts, ends):
    """Print ``hb_use_get_category`` for the runs described by the arguments."""
    page_bits = 12
    print("USE_TABLE_ELEMENT_TYPE")
    print("hb_use_get_category (hb_codepoint_t u)")
    print("{")
    print(" switch (u >> %d)" % page_bits)
    print(" {")
    pages = set([u >> page_bits for u in starts + ends])
    for p in sorted(pages):
        print(" case 0x%0Xu:" % p)
        for (start, end) in zip(starts, ends):
            if p not in [start >> page_bits, end >> page_bits]: continue
            offset_name = "use_offset_0x%04xu" % start
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end - 1, start, offset_name))
        print(" break;")
        print("")
    print(" default:")
    print(" break;")
    print(" }")
    print(" return USE_O;")
    print("}")
    print()


def print_epilogue():
    """Print the ``#undef`` lines and the closing of the generated file."""
    for k in sorted(use_mapping.keys()):
        if k in use_positions and use_positions[k]: continue
        print("#undef %s" % k)
    for k, v in sorted(use_positions.items()):
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            print("#undef %s" % tag)
    print()
    print()
    print('#endif')
    print("/* == End of generated table == */")


def main(argv):
    """Run the generator with ``argv`` laid out like ``sys.argv``.

    Exits with status 1 and a usage message when the argument count is
    wrong.  Raises ``Exception`` after printing if the generated table is
    less than 50% occupied.
    """
    if len(argv) != 5:
        print("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
        sys.exit(1)

    headers, data = read_property_files(argv[1:])
    apply_syllabic_category_overrides(data[0])
    # Classify everything before printing so that a failed assertion does
    # not leave a partial table on stdout.
    data = map_to_use(merge_properties(data))

    print_prologue(headers)
    print_macro_definitions()
    offset, starts, ends = print_table(data)
    occupancy = used * 100. / total
    print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    print()
    print_lookup_function(starts, ends)
    print_epilogue()

    # Maintain at least 50% occupancy in the table
    if occupancy < 50:
        raise Exception("Table too sparse, please investigate: ", occupancy)


if __name__ == '__main__':
    main(sys.argv)