#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import sys

if len (sys.argv) != 8:
    sys.exit (__doc__)

# Codepoints whose block name is listed here are dropped from the table.
BLACKLISTED_BLOCKS = [
    'Samaritan',
    'Thai',
    'Lao',
]

# File indices follow the command-line order given in the usage text:
#   0 IndicSyllabicCategory    1 IndicPositionalCategory   2 UnicodeData
#   3 ArabicShaping            4 Blocks
#   5 IndicSyllabicCategory-Additional   6 IndicPositionalCategory-Additional
files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# The first two lines of every file except UnicodeData.txt (index 2) are kept
# as its header.  Because index 2 is skipped, files 5 and 6 live at
# headers[4] and headers[5]; for those two, lines are appended up to the
# first blank line.
headers = [[f.readline () for _ in range (2)] for j,f in enumerate(files) if j != 2]
for j in range(5, 7):
    for line in files[j]:
        line = line.rstrip()
        if not line:
            break
        headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])

# data[k][codepoint] -> property value string read from file k.
# values[k][value]   -> number of codepoints seen with that value.
# The two "-Additional" files (5 and 6) are folded into slots 0 and 1.
data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
    extended = False

    for line in f:

        # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/522
        # Once the 'USE_Syllabic_Category' marker has been seen, commented-out
        # records ("# xxxx ; value") are parsed as if they were live.
        # str.find() returns -1 (truthy) when nothing is found, so it must
        # not be used as a boolean; test membership instead.
        if extended and line.startswith ('# ') and ';' in line:
            line = line[2:]
        elif 'USE_Syllabic_Category' in line:
            extended = True

        # Strip trailing comments.
        j = line.find ('#')
        if j >= 0:
            line = line[:j]

        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            continue

        # First field is either a single codepoint or a "start..end" range.
        uu = fields[0].split ('..')
        start = int (uu[0], 16)
        if len (uu) == 1:
            end = start
        else:
            end = int (uu[1], 16)

        # UnicodeData.txt and ArabicShaping.txt carry the value in the third
        # field; every other file carries it in the second.
        t = fields[1 if i not in [2, 3] else 2]

        if i == 3:
            t = 'jt_' + t
        elif i == 5 and t == 'Consonant_Final_Modifier':
            # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
            t = 'Syllable_Modifier'
        elif i == 6 and t == 'NA':
            t = 'Not_Applicable'

        i0 = i if i < 5 else i - 5
        for u in range (start, end + 1):
            data[i0][u] = t
        values[i0][t] = values[i0].get (t, 0) + end - start + 1

# All input has been consumed; release the file handles.
for f in files:
    f.close ()

# Default value for each of the five merged slots, in slot order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'jt_X', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x0640] = defaults[0]
data[0][0x1B61] = defaults[0]
data[0][0x1B63] = defaults[0]
data[0][0x1B64] = defaults[0]
data[0][0x1B65] = defaults[0]
data[0][0x1B66] = defaults[0]
data[0][0x1B67] = defaults[0]
data[0][0x1B69] = defaults[0]
data[0][0x1B6A] = defaults[0]
data[0][0x2060] = defaults[0]
for u in range (0x07CA, 0x07EA + 1):
    data[0][u] = defaults[0]
data[0][0x07FA] = defaults[0]
for u in range (0x0840, 0x0858 + 1):
    data[0][u] = defaults[0]
for u in range (0x1887, 0x18A8 + 1):
    data[0][u] = defaults[0]
data[0][0x18AA] = defaults[0]
for u in range (0xA840, 0xA872 + 1):
    data[0][u] = defaults[0]
for u in range (0x10B80, 0x10B91 + 1):
    data[0][u] = defaults[0]
for u in range (0x10BA9, 0x10BAE + 1):
    data[0][u] = defaults[0]
data[0][0x10FB0] = defaults[0]
for u in range (0x10FB2, 0x10FB6 + 1):
    data[0][u] = defaults[0]
for u in range (0x10FB8, 0x10FBF + 1):
    data[0][u] = defaults[0]
for u in range (0x10FC1, 0x10FC4 + 1):
    data[0][u] = defaults[0]
for u in range (0x10FC9, 0x10FCB + 1):
    data[0][u] = defaults[0]
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
data[0][0x1B5B] = 'Consonant_Placeholder'
data[0][0x1B5C] = 'Consonant_Placeholder'
data[0][0x1B5F] = 'Consonant_Placeholder'
data[0][0x1B62] = 'Consonant_Placeholder'
data[0][0x1B68] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
data[0][0x11C44] = 'Consonant_Placeholder'
data[0][0x11C45] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
data[0][0x111C8] = 'Consonant_Placeholder'

# Merge data into one dict:
#   combined[codepoint] = [slot0, slot1, slot2, slot3, slot4]
# Only slots 0 and 1 may introduce a codepoint; slots 2 and above merely
# annotate codepoints that are already present.
for i,v in enumerate (defaults):
    values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (data):
    for u,v in d.items ():
        if i >= 2 and u not in combined:
            continue
        if u not in combined:
            combined[u] = list (defaults)
        combined[u][i] = v
combined = {k:v for k,v in combined.items() if v[4] not in BLACKLISTED_BLOCKS}
data = combined
del combined


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Consonant_Initial_Postfixed',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    'Hieroglyph',
    'Hieroglyph_Joiner',
    'Hieroglyph_Segment_Begin',
    'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Bottom_And_Left',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
    # Joining_Type
    'jt_C',
    'jt_D',
    'jt_L',
    'jt_R',
    'jt_T',
    'jt_U',
    'jt_X',
]

class PropertyValue(object):
    """A named property value that compares equal to its own name.

    Instances are injected into the module namespace below so that the
    predicates can refer to e.g. ``Consonant`` directly, while the parsed
    data still holds plain strings.
    """
    def __init__(self, name_):
        self.name = name_
    def __str__(self):
        return self.name
    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        if isinstance(other, PropertyValue):
            return self.name == other.name
        # Let Python fall back to its default handling instead of raising
        # AttributeError on unrelated types.
        return NotImplemented
    def __ne__(self, other):
        return not (self == other)
    def __hash__(self):
        # Same hash as the name so that str and PropertyValue keys collide.
        return hash(str(self))

property_values = {}

# Create one PropertyValue per name and expose each as a module-level global.
for name in property_names:
    value = PropertyValue(name)
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
globals().update(property_values)


# Category predicates.  Each one takes
#   U    -- the codepoint (int)
#   UISC -- slot 0 value (Indic_Syllabic_Category)
#   UGC  -- slot 2 value (General_Category)
#   AJT  -- slot 3 value (joining type, 'jt_' prefixed)
# and returns whether the codepoint belongs to the category it is registered
# under in use_mapping.  map_to_use() asserts that exactly one predicate
# matches every codepoint.

def is_BASE(U, UISC, UGC, AJT):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
            Tone_Letter,
            Vowel_Independent,
            ] or
        # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
        AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
        (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                    Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UGC, AJT):
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UGC, AJT):
    if UISC == Consonant_Placeholder: return True
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CONS_FINAL(U, UISC, UGC, AJT):
    return ((UISC == Consonant_Final and UGC != Lo) or
        UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UGC, AJT):
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UGC, AJT):
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return (UISC == Consonant_Medial and UGC != Lo or
        UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UGC, AJT):
    return (UISC in [Nukta, Gemination_Mark, Consonant_Killer] and
        not is_SYM_MOD(U, UISC, UGC, AJT))
def is_CONS_SUB(U, UISC, UGC, AJT):
    return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UGC, AJT):
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UGC, AJT):
    return (UISC in [Virama, Invisible_Stacker]
        and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC, AJT)
        and not is_SAKOT(U, UISC, UGC, AJT))
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC, AJT):
    # https://github.com/harfbuzz/harfbuzz/issues/1102
    # https://github.com/harfbuzz/harfbuzz/issues/1379
    return U in [0x11046, 0x1134D]
def is_HALANT_NUM(U, UISC, UGC, AJT):
    return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UGC, AJT):
    return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UGC, AJT):
    return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UGC, AJT):
    return UISC == Hieroglyph_Segment_Begin
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UGC, AJT):
    return UISC == Hieroglyph_Segment_End
def is_ZWNJ(U, UISC, UGC, AJT):
    return UISC == Non_Joiner
def is_OTHER(U, UISC, UGC, AJT):
    return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
        and not is_BASE(U, UISC, UGC, AJT)
        and not is_BASE_OTHER(U, UISC, UGC, AJT)
        and not is_SYM(U, UISC, UGC, AJT)
        and not is_SYM_MOD(U, UISC, UGC, AJT)
    )
def is_REPHA(U, UISC, UGC, AJT):
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UGC, AJT):
    return U == 0x1A60
def is_SYM(U, UISC, UGC, AJT):
    if U in [0x25CC, 0x1E14F]: return False
    return UGC in [So, Sc] and U not in [0x0F01, 0x1B62, 0x1B68]
def is_SYM_MOD(U, UISC, UGC, AJT):
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
def is_VOWEL(U, UISC, UGC, AJT):
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC == Pure_Killer or
        (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
def is_VOWEL_MOD(U, UISC, UGC, AJT):
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
        (UGC != Lo and (UISC == Bindu or U in [0xAA29])))

# Category tag -> predicate.
# CGJ, VS, WJ, and ZWJ are handled in find_syllables
use_mapping = {
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'G': is_HIEROGLYPH,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'R': is_REPHA,
    'S': is_SYM,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}

# Category tag -> {suffix: [positional values]}.  A tag mapped to a dict gets
# the matching suffix appended (e.g. 'V' + 'Abv'); a tag mapped to None may
# carry a positional value but receives no suffix.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HVM': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'R': None,
    'SUB': None,
}

def map_to_use(data):
    """Map every codepoint in *data* to its category tag.

    *data* maps codepoint -> (UISC, UIPC, UGC, AJT, UBlock).  Returns a dict
    mapping codepoint -> (tag, UBlock), where the tag is a key of
    use_mapping, optionally followed by a positional suffix taken from
    use_positions.

    Raises AssertionError if a codepoint matches zero or several category
    predicates, or if its positional value cannot be resolved unambiguously.
    """
    out = {}
    items = use_mapping.items()
    for U,(UISC,UIPC,UGC,AJT,UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
        if 0x1BF2 <= U <= 0x1BF3:
            UISC = Nukta
            UIPC = Bottom

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED: UISC = Tone_Mark

        # TODO: https://github.com/microsoft/font-tools/issues/1
        if U == 0xA982: UISC = Consonant_Succeeding_Repha

        # Exactly one predicate must claim the codepoint.
        matches = [k for k,v in items if v(U,UISC,UGC,AJT)]
        assert len(matches) == 1, "%s %s %s %s %s" % (hex(U), UISC, UGC, AJT, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: These should die, but have UIPC in Unicode 13.0.0
        if U in [0x953, 0x954]: UIPC = Not_Applicable

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
        if 0xA926 <= U <= 0xA92A: UIPC = Top
        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
        if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
        # also https://github.com/harfbuzz/harfbuzz/issues/1012
        if 0x1112A <= U <= 0x1112B: UIPC = Top
        if 0x11131 <= U <= 0x11132: UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
                USE in use_positions), "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, AJT)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            suffixes = [k for k,v in pos_mapping.items() if v and UIPC in v]
            assert len(suffixes) == 1, "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, AJT, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out

# From here on `data` maps codepoint -> (tag, block) and `defaults` is the
# entry print_block() uses for codepoints that have no data.
defaults = ('O', 'No_Block')
data = map_to_use(data)

print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
    for header_line in h:
        print (" * %s" % (header_line.strip()))
print (" */")
print ()
print ("#ifndef HB_OT_SHAPE_COMPLEX_USE_TABLE_HH")
print ("#define HB_OT_SHAPE_COMPLEX_USE_TABLE_HH")
print ()
print ('#include "hb.hh"')
print ()
print ('#include "hb-ot-shape-complex-use-machine.hh"')
print ()

# Running totals maintained by print_block(): table slots emitted, slots that
# correspond to a known codepoint, and the last block name printed.
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
    """Print the table entries for codepoints start..end (inclusive).

    A new row is started at every multiple of 16.  When *block* is a new
    block name, a comment naming it is printed first.  *start* and *end* + 1
    must both be multiples of 8.  Updates the module-level `total`, `used`
    and `last_block`.
    """
    global total, used, last_block
    if block and block != last_block:
        print ()
        print ()
        print (" /* %s */" % block)
        if start % 16:
            # Pad so that a block starting mid-row lines up with its column.
            print (' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 16 == 0:
            print ()
            print (" /* %04X */" % u, end='')
        if u in data:
            num += 1
        d = data.get (u, defaults)
        print ("%6s," % d[0], end='')

    total += end - start + 1
    used += num
    if block:
        last_block = block

uu = sorted (data.keys ())

last = -100000
num = 0
offset = 0
# Parallel lists: starts[k] .. ends[k]-1 is the k-th contiguous range stored
# in the table.
starts = []
ends = []
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k,v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]: continue
    print ("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print ("#define %s USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
print ("static const uint8_t use_table[] = {")
for u in uu:
    if u <= last:
        continue
    if data[u][0] == 'O':
        continue
    block = data[u][1]

    # Extend the run over following codepoints of the same block, then round
    # outwards to multiples of 8.  `data` has exactly the keys listed in
    # `uu`, so the dict is used for the membership test (constant time
    # instead of a linear list scan).
    start = u//8*8
    end = start+1
    while end in data and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: fill it with default entries rather than opening a
            # new range.
            print_block (None, last+1, start-1, data)
        else:
            # Large gap: close the previous range and start a new one at
            # its own offset into the table.
            if last >= 0:
                ends.append (last + 1)
                offset += ends[-1] - starts[-1]
            print ()
            print ()
            print ("#define use_offset_0x%04xu %d" % (start, offset))
            starts.append (start)

    print_block (block, start, end, data)
    last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()
print ("static inline uint8_t")
print ("hb_use_get_category (hb_codepoint_t u)")
print ("{")
print (" switch (u >> %d)" % page_bits)
print (" {")
# One switch case per page (codepoint >> page_bits) touched by any range.
pages = {u>>page_bits for u in starts+ends}
for p in sorted(pages):
    print (" case 0x%0Xu:" % p)
    for (start,end) in zip (starts, ends):
        if p not in [start>>page_bits, end>>page_bits]: continue
        offset = "use_offset_0x%04xu" % start
        print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print (" break;")
    print ("")
print (" default:")
print (" break;")
print (" }")
print (" return USE(O);")
print ("}")
print ()
for k in sorted(use_mapping.keys()):
    if k in use_positions and use_positions[k]: continue
    print ("#undef %s" % k)
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print ("#undef %s" % tag)
print ()
print ()
print ("#endif /* HB_OT_SHAPE_COMPLEX_USE_TABLE_HH */")
print ("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception ("Table too sparse, please investigate: ", occupancy)