#!/usr/bin/env python2
#
# Extract rules for Unicode case conversion, specifically the behavior
# required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19.  The
# bitstream encoded rules are used for the slow path at run time, so
# compactness is favored over speed.
#
# There is no support for context or locale sensitive rules, as they
# are handled directly in C code before consulting tables generated
# here.  ECMAScript requires case conversion both with and without
# locale/language specific rules (e.g. String.prototype.toLowerCase()
# and String.prototype.toLocaleLowerCase()), so they are best handled
# in C anyway.
#
# Case conversion rules for ASCII are also excluded as they are handled
# by the C fast path.  Rules for non-BMP characters (codepoints above
# U+FFFF) are omitted as they're not required for standard ECMAScript.
#

import os
import sys
import re
import math
import optparse

import dukutil

class UnicodeData:
    """Read UnicodeData.txt into an internal representation."""

    def __init__(self, filename):
        self.data = self.read_unicode_data(filename)
        print('read %d unicode data entries' % len(self.data))

    def read_unicode_data(self, filename):
        """Parse UnicodeData.txt lines into 15-field lists, sorted by codepoint.

        Each entry is the semicolon-separated field list of one line;
        field 0 is the codepoint as a hex string.
        """
        res = []
        f = open(filename, 'rb')
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            if len(parts) != 15:
                raise Exception('invalid unicode data line')
            res.append(parts)
        f.close()

        # Sort based on Unicode codepoint (field 0, hex string).  A key
        # based sort is equivalent to the old cmp based one but avoids
        # the deprecated cmp= argument.
        res.sort(key=lambda parts: long(parts[0], 16))
        return res

class SpecialCasing:
    """Read SpecialCasing.txt into an internal representation."""

    def __init__(self, filename):
        self.data = self.read_special_casing_data(filename)
        print('read %d special casing entries' % len(self.data))

    def read_special_casing_data(self, filename):
        """Parse SpecialCasing.txt lines into stripped field lists.

        Trailing '#' comments are removed; every entry is padded to at
        least 6 fields so that the condition field (index 4) always exists.
        """
        res = []
        f = open(filename, 'rb')
        for line in f:
            # Strip trailing comment, if any.
            try:
                idx = line.index('#')
                line = line[:idx]
            except ValueError:
                pass
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            parts = [i.strip() for i in parts]
            while len(parts) < 6:
                parts.append('')
            res.append(parts)
        f.close()
        return res

def parse_unicode_sequence(x):
    """Parse a Unicode sequence like 'ABCD 1234' into a unicode string."""
    res = ''
    for i in x.split(' '):
        i = i.strip()
        if i == '':
            continue
        res += unichr(long(i, 16))
    return res

def get_base_conversion_maps(unicode_data):
    """Create case conversion tables without handling special casing yet.

    Returns (uc, lc, tc): dicts mapping a BMP codepoint (number) to its
    uppercase / lowercase / titlecase result (unicode string).
    """
    uc = {}  # uppercase, codepoint (number) -> string
    lc = {}  # lowercase
    tc = {}  # titlecase

    for x in unicode_data.data:
        c1 = long(x[0], 16)

        # Just 16-bit support needed.
        if c1 >= 0x10000:
            continue

        if x[12] != '':
            # field 12: simple uppercase mapping
            c2 = parse_unicode_sequence(x[12])
            uc[c1] = c2
            tc[c1] = c2  # titlecase default == uppercase, overridden below if necessary
        if x[13] != '':
            # field 13: simple lowercase mapping
            c2 = parse_unicode_sequence(x[13])
            lc[c1] = c2
        if x[14] != '':
            # field 14: simple titlecase mapping
            c2 = parse_unicode_sequence(x[14])
            tc[c1] = c2

    return uc, lc, tc

def update_special_casings(uc, lc, tc, special_casing):
    """Update case conversion tables with special case conversion rules.

    Only unconditional 1:n mappings are applied; conditional (locale or
    context sensitive) rules are handled directly in C code, and 1:1
    mappings are already covered by UnicodeData.txt.
    """
    for x in special_casing.data:
        c1 = long(x[0], 16)

        if x[4] != '':
            # Field 4 non-empty: conditional rule, skip.
            continue

        lower = parse_unicode_sequence(x[1])
        title = parse_unicode_sequence(x[2])
        upper = parse_unicode_sequence(x[3])

        if len(lower) > 1:
            lc[c1] = lower
        if len(upper) > 1:
            uc[c1] = upper
        if len(title) > 1:
            tc[c1] = title

        print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))

def remove_ascii_part(convmap):
    """Remove ASCII case conversion parts (handled by C fast path)."""

    for i in xrange(128):
        if i in convmap:
            del convmap[i]

def scan_range_with_skip(convmap, start_idx, skip):
    """Scan for a range of continuous case conversion with a certain 'skip'.

    Starting at codepoint start_idx, follow 1:1 mappings at intervals of
    'skip' for as long as the output also advances by 'skip'.  Returns
    (start_in, start_out, count), or (None, None, None) when no range of
    at least 2 mappings begins at start_idx.  Found mappings are removed
    from convmap as a side effect.
    """
    conv_i = start_idx
    if conv_i not in convmap:
        return None, None, None
    elif len(convmap[conv_i]) > 1:
        # 1:n mappings never participate in ranges.
        return None, None, None
    else:
        conv_o = ord(convmap[conv_i])

    start_i = conv_i
    start_o = conv_o

    while True:
        new_i = conv_i + skip
        new_o = conv_o + skip

        if new_i not in convmap:
            break
        if len(convmap[new_i]) > 1:
            break
        if ord(convmap[new_i]) != new_o:
            break

        conv_i = new_i
        conv_o = new_o

    # [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char.
    count = (conv_i - start_i) // skip + 1
    if count <= 1:
        return None, None, None

    # We have an acceptable range, remove them from the convmap here.
    for i in xrange(start_i, conv_i + skip, skip):
        del convmap[i]

    return start_i, start_o, count

def find_first_range_with_skip(convmap, skip):
    """Find first range with a certain 'skip' value, removing it from convmap."""

    for i in xrange(65536):
        start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
        if start_i is None:
            continue
        return start_i, start_o, count

    return None, None, None

def generate_caseconv_tables(convmap):
    """Generate bit-packed case conversion table for a given conversion map.

    Returns (bytes, nbits) of the bitstream encoding.  convmap is consumed
    (emptied) in the process.
    """
    # The bitstream encoding is based on manual inspection for whatever
    # regularity the Unicode case conversion rules have.
    #
    # Start with a full description of case conversions which does not
    # cover all codepoints; unmapped codepoints convert to themselves.
    # Scan for range-to-range mappings with a range of skips starting from 1.
    # Whenever a valid range is found, remove it from the map.  Finally,
    # output the remaining case conversions (1:1 and 1:n) on a per codepoint
    # basis.
    #
    # This is very slow because we always scan from scratch, but it's the
    # most reliable and simple way to scan.

    print('generate caseconv tables')

    ranges = []   # range mappings (2 or more consecutive mappings with a certain skip)
    singles = []  # 1:1 character mappings
    multis = []   # 1:n character mappings

    # Ranges with skips

    for skip in xrange(1, 6 + 1):  # skips 1...6 are useful
        while True:
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
            if start_i is None:
                break
            print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
            ranges.append([start_i, start_o, count, skip])

    # 1:1 conversions

    for i in sorted(convmap.keys()):
        if len(convmap[i]) > 1:
            continue
        singles.append([i, ord(convmap[i])])  # codepoint, codepoint
        del convmap[i]

    # There are many mappings to 2-char sequences with latter char being U+0399.
    # These could be handled as a special case, but we don't do that right now.
    #
    # [8064L, u'\u1f08\u0399']
    # [8065L, u'\u1f09\u0399']
    # [8066L, u'\u1f0a\u0399']
    # [8067L, u'\u1f0b\u0399']
    # [8068L, u'\u1f0c\u0399']
    # [8069L, u'\u1f0d\u0399']
    # [8070L, u'\u1f0e\u0399']
    # [8071L, u'\u1f0f\u0399']
    # ...
    #
    # tmp = {}
    # k = convmap.keys()
    # k.sort()
    # for i in k:
    #     if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
    #         tmp[i] = convmap[i][0]
    #         del convmap[i]
    # print(repr(tmp))
    #
    # skip = 1
    # while True:
    #     start_i, start_o, count = find_first_range_with_skip(tmp, skip)
    #     if start_i is None:
    #         break
    #     print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count))
    # print(len(tmp.keys()))
    # print(repr(tmp))
    # XXX: need to put 12 remaining mappings back to convmap

    # 1:n conversions

    for i in sorted(convmap.keys()):
        multis.append([i, convmap[i]])  # codepoint, string
        del convmap[i]

    for t in singles:
        print('- singles: ' + repr(t))

    for t in multis:
        print('- multis: ' + repr(t))

    print('- range mappings: %d' % len(ranges))
    print('- single character mappings: %d' % len(singles))
    print('- complex mappings (1:n): %d' % len(multis))
    print('- remaining (should be zero): %d' % len(convmap.keys()))

    # XXX: opportunities for diff encoding skip=3 ranges?
    prev = None
    for t in ranges:
        # range: [start_i, start_o, count, skip]
        if t[3] != 3:
            continue
        if prev is not None:
            print('- %d %d' % (t[0] - prev[0], t[1] - prev[1]))
        else:
            print('- start: %d %d' % (t[0], t[1]))
        prev = t

    # Bit packed encoding.  Layout:
    #   per skip 1..6: 6-bit range count, then per range 16-bit start_in,
    #   16-bit start_out, 7-bit count; terminated by 6-bit 0x3f marker;
    #   then 7-bit singles count + 16+16 bit pairs; then 7-bit multis
    #   count + 16-bit codepoint, 2-bit length, 16-bit chars.

    be = dukutil.BitEncoder()

    for curr_skip in xrange(1, 7):  # 1...6
        count = 0
        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            count += 1
        be.bits(count, 6)
        print('- encode: skip=%d, count=%d' % (curr_skip, count))

        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            be.bits(start_i, 16)
            be.bits(start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)  # maximum count value = end of skips

    count = len(singles)
    be.bits(count, 7)
    for t in singles:
        cp_i, cp_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    count = len(multis)
    be.bits(count, 7)
    for t in multis:
        cp_i, str_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for i in xrange(len(str_o)):
            be.bits(ord(str_o[i]), 16)

    return be.getBytes(), be.getNumBits()

def generate_regexp_canonicalize_tables(convmap):
    """Generate tables for case insensitive RegExp normalization.

    Returns (canontab, canon_bitmap): a direct 65536-entry codepoint
    lookup and a per-block continuity bitmap (dict with 'data',
    'block_size', 'block_shift', 'block_mask').
    """
    # Generate a direct codepoint lookup for canonicalizing BMP range.

    def generate_canontab():
        res = []
        highest_nonid = -1

        for cp in xrange(65536):
            res_cp = cp  # default to as is
            if cp in convmap:
                tmp = convmap[cp]
                if len(tmp) == 1:
                    # If multiple codepoints from input, ignore.
                    res_cp = ord(tmp[0])
            if cp >= 0x80 and res_cp < 0x80:
                res_cp = cp  # If non-ASCII mapped to ASCII, ignore.
            if cp != res_cp:
                highest_nonid = cp
            res.append(res_cp)

        # At the moment this is 65370, which means there's very little
        # gain in assuming 1:1 mapping above a certain BMP codepoint
        # (though we do assume 1:1 mapping for above BMP codepoints).
        print('- highest non-identity mapping: %d' % highest_nonid)

        return res

    print('generate canontab')
    canontab = generate_canontab()

    # Figure out which BMP values are never the result of canonicalization.
    # Such codepoints are "don't care" in the sense that they are never
    # matched against at runtime: ranges are canonicalized at compile time,
    # and codepoint being matched is also canonicalized at run time.
    # (Currently unused.)

    def generate_dontcare():
        res = [ True ] * 65536
        for cp in canontab:
            res[cp] = False
        res_count = 0
        for x in res:
            if x:
                res_count += 1
        print('- %d dontcare codepoints' % res_count)
        return res

    print('generate canon dontcare')
    dontcare = generate_dontcare()

    # Generate maximal continuous ranges for canonicalization.  A continuous
    # range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
    # for fixed IN, OUT, and i in 0...N-1.  There are unfortunately >1000
    # of these ranges, mostly because there are a lot of individual exceptions.
    # (Currently unused.)

    canon_ranges = []
    for cp in xrange(65536):
        canon_ranges.append([ cp, canontab[cp], 1 ])  # 1 codepoint ranges at first
    def merge_compatible_nogap(rng1, rng2):
        # Merge adjacent ranges if continuity allows.
        if rng1[0] + rng1[2] == rng2[0] and \
           rng1[1] + rng1[2] == rng2[1]:
            return [ rng1[0], rng1[1], rng1[2] + rng2[2] ]
        return None
    def merge_check_nogap():
        # One merge pass; returns the filtered list if anything was
        # merged, or None when a fixpoint has been reached.
        len_start = len(canon_ranges)
        for i in xrange(len(canon_ranges) - 1):
            j = i + 1
            rng1 = canon_ranges[i]
            rng2 = canon_ranges[j]
            if rng1 is None or rng2 is None: continue
            merged = merge_compatible_nogap(rng1, rng2)
            if merged is not None:
                canon_ranges[j] = None
                canon_ranges[i] = merged
        filtered = []
        for x in canon_ranges:
            if x is not None:
                filtered.append(x)
        len_end = len(filtered)
        if len_end < len_start:
            return filtered
        return None

    print('generate canon_ranges')
    while True:
        # Starting from individual ranges of 1 codepoint, merge adjacent
        # ranges until no more ranges can be merged.
        t = merge_check_nogap()
        if t is None:
            break
        canon_ranges = t
    print('- %d ranges' % len(canon_ranges))
    #for rng in canon_ranges:
    #    print('canon_ranges:')
    #    print(repr(rng))

    # Generate true/false ranges for BMP codepoints where:
    # - A codepoint is flagged true if continuity is broken at that point, so
    #   an explicit codepoint canonicalization is needed at runtime.
    # - A codepoint is flagged false if case conversion is continuous from the
    #   previous codepoint, i.e. out_curr = out_prev + 1.
    #
    # The result is a lot of small ranges due to a lot of small 'false' ranges.
    # Reduce the range set by checking if adjacent 'true' ranges have at most
    # false_limit 'false' entries between them.  If so, force the 'false'
    # entries to 'true' (safe but results in an unnecessary runtime codepoint
    # lookup) and merge the three ranges into a larger 'true' range.
    #
    # (Currently unused.)

    def generate_needcheck_straight():
        res = [ True ] * 65536
        assert(canontab[0] == 0)  # can start from in == out == 0
        prev_in = -1
        prev_out = -1
        for i in xrange(65536):
            # First create a straight true/false bitmap for BMP.
            curr_in = i
            curr_out = canontab[i]
            if prev_in + 1 == curr_in and prev_out + 1 == curr_out:
                res[i] = False
            prev_in = curr_in
            prev_out = curr_out
        return res
    def generate_needcheck_ranges(data):
        # Generate maximal accurate [value, count] run-length ranges.
        prev = None
        count = 0
        ranges = []
        for i in data:
            if prev is None or prev != i:
                if prev is not None:
                    ranges.append([ prev, count ])
                prev = i
                count = 1
            else:
                count += 1
        if prev is not None:
            ranges.append([ prev, count ])
        return ranges
    def fillin_needcheck_ranges(data, false_limit):
        # Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
        # safe (leads to an unnecessary runtime check) but reduces
        # range data size considerably.
        res = []
        for r in data:
            res.append([ r[0], r[1] ])
        while True:
            found = False
            for i in xrange(len(res) - 2):
                r1 = res[i]
                r2 = res[i + 1]
                r3 = res[i + 2]
                if r1[0] == True and r2[0] == False and r3[0] == True and \
                   r2[1] <= false_limit:
                    #print('fillin %d falses' % r2[1])
                    res.pop(i + 2)
                    res.pop(i + 1)
                    res[i] = [ True, r1[1] + r2[1] + r3[1] ]
                    found = True
                    break
            if not found:
                break
        return res

    print('generate needcheck straight')
    needcheck = generate_needcheck_straight()

    print('generate needcheck without false fillins')
    needcheck_ranges1 = generate_needcheck_ranges(needcheck)
    print('- %d ranges' % len(needcheck_ranges1))
    #print(needcheck_ranges1)

    print('generate needcheck with false fillins')
    needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
    print('- %d ranges' % len(needcheck_ranges2))
    #print(needcheck_ranges2)

    # Generate a bitmap for BMP, divided into N-codepoint blocks, with each
    # bit indicating: "entire codepoint block canonicalizes continuously, and
    # the block is continuous with the previous and next block".  A 'true'
    # entry allows runtime code to just skip the block, advancing 'in' and
    # 'out' by the block size, with no codepoint conversion.  The block size
    # should be large enough to produce a relatively small lookup table, but
    # small enough to reduce codepoint conversions to a manageable number
    # because the conversions are (currently) quite slow.  This matters
    # especially for case-insensitive RegExps; without any optimization,
    # /[\u0000-\uffff]/i requires 65536 case conversions for runtime
    # normalization.

    block_shift = 5
    block_size = 1 << block_shift
    block_mask = block_size - 1
    num_blocks = 65536 // block_size

    def generate_block_bits(check_continuity):
        res = [ True ] * num_blocks
        for i in xrange(num_blocks):
            base_in = i * block_size
            base_out = canontab[base_in]
            if check_continuity:
                lower = -1  # [-1,block_size]: also check edges against neighbors
                upper = block_size + 1
            else:
                lower = 0   # [0,block_size-1]
                upper = block_size
            for j in xrange(lower, upper):
                cp = base_in + j
                if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j:
                    res[i] = False
                    break
        return res

    def dump_block_bitmap(bits):
        tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits])
        tmp = re.sub(r'.{64}', lambda m: m.group(0) + '\n', tmp)
        blocks_true = tmp.count('x')
        blocks_false = tmp.count('.')
        print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
        sys.stdout.write(tmp)
        #print(bits)

    def dump_test_lookup(bits):
        # Debug helper: dump the bitmap as a C array initializer.
        sys.stdout.write('duk_uint8_t test[] = {')
        for b in bits:
            if b:
                sys.stdout.write('1,')
            else:
                sys.stdout.write('0,')
        sys.stdout.write('};\n')

    def convert_to_bitmap(bits):
        # C code looks up bits as:
        #   index = codepoint >> N
        #   bitnum = codepoint & mask
        #   bitmask = 1 << bitnum
        # So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of
        # first byte, etc.
        res = []
        curr = 0
        mask = 0x01
        for b in bits:
            if b:
                curr += mask
            mask = mask * 2
            if mask == 0x100:
                res.append(curr)
                curr = 0
                mask = 0x01
        assert(mask == 0x01)  # no leftover
        return res

    print('generate canon block bitmap without continuity')
    block_bits1 = generate_block_bits(False)
    dump_block_bitmap(block_bits1)
    dump_test_lookup(block_bits1)

    print('generate canon block bitmap with continuity')
    block_bits2 = generate_block_bits(True)
    dump_block_bitmap(block_bits2)
    dump_test_lookup(block_bits2)

    print('generate final canon bitmap')
    block_bitmap = convert_to_bitmap(block_bits2)
    print('- %d bytes' % len(block_bitmap))
    print('- ' + repr(block_bitmap))
    canon_bitmap = {
        'data': block_bitmap,
        'block_size': block_size,
        'block_shift': block_shift,
        'block_mask': block_mask
    }

    # This is useful to figure out corner case test cases.
    print('canon blocks which are different with and without continuity check')
    for i in xrange(num_blocks):
        if block_bits1[i] != block_bits2[i]:
            print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))

    return canontab, canon_bitmap

def clonedict(x):
    """Shallow clone of input dict."""
    res = {}
    for k in x.keys():
        res[k] = x[k]
    return res

def main():
    parser = optparse.OptionParser()
    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
    parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
    (opts, args) = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

        # ASCII is handled with 'fast path' so not needed here.
        # Work on clones because table generation consumes the maps.
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, uc_nbits = generate_caseconv_tables(t)

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, lc_nbits = generate_caseconv_tables(t)

        # Generate C source and header files.
        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_lookup':
        # Direct canonicalization lookup for case insensitive regexps, includes ascii part.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_bitmap':
        # N-codepoint block bitmap for skipping continuous codepoint blocks
        # quickly.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data'])))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    else:
        raise Exception('invalid command: %r' % opts.command)

if __name__ == '__main__':
    main()