#!/usr/bin/env python2
#
#  Extract rules for Unicode case conversion, specifically the behavior
#  required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19.  The
#  bitstream-encoded rules are used for the slow path at run time, so
#  compactness is favored over speed.
#
#  There is no support for context or locale sensitive rules, as they
#  are handled directly in C code before consulting tables generated
#  here.  ECMAScript requires case conversion both with and without
#  locale/language specific rules (e.g. String.prototype.toLowerCase()
#  and String.prototype.toLocaleLowerCase()), so they are best handled
#  in C anyway.
#
#  Case conversion rules for ASCII are also excluded as they are handled
#  by the C fast path.  Rules for non-BMP characters (codepoints above
#  U+FFFF) are omitted as they're not required for standard ECMAScript.
#

import os
import sys
import re
import math
import optparse

import dukutil

class UnicodeData:
    """Read UnicodeData.txt into an internal representation."""

    def __init__(self, filename):
        self.data = self.read_unicode_data(filename)
        print('read %d unicode data entries' % len(self.data))

    def read_unicode_data(self, filename):
        res = []
        f = open(filename, 'rb')
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            if len(parts) != 15:
                raise Exception('invalid unicode data line')
            res.append(parts)
        f.close()

        # Sort based on Unicode codepoint.
        def mycmp(a, b):
            return cmp(long(a[0], 16), long(b[0], 16))

        res.sort(cmp=mycmp)
        return res

class SpecialCasing:
    """Read SpecialCasing.txt into an internal representation."""

    def __init__(self, filename):
        self.data = self.read_special_casing_data(filename)
        print('read %d special casing entries' % len(self.data))

    def read_special_casing_data(self, filename):
        res = []
        f = open(filename, 'rb')
        for line in f:
            try:
                # Strip trailing comments.
                idx = line.index('#')
                line = line[:idx]
            except ValueError:
                pass
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            parts = [i.strip() for i in parts]
            while len(parts) < 6:
                parts.append('')
            res.append(parts)
        f.close()
        return res

def parse_unicode_sequence(x):
    """Parse a Unicode sequence like ABCD 1234 into a unicode string."""
    res = u''
    for i in x.split(' '):
        i = i.strip()
        if i == '':
            continue
        res += unichr(long(i, 16))
    return res
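
# Usage sketch (illustrative): parse_unicode_sequence('0053 0073')
# yields u'Ss' (U+0053 followed by U+0073).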

def get_base_conversion_maps(unicode_data):
    """Create case conversion tables without handling special casing yet."""

    uc = {}        # uppercase, codepoint (number) -> string
    lc = {}        # lowercase
    tc = {}        # titlecase

    for x in unicode_data.data:
        c1 = long(x[0], 16)

        # Only 16-bit (BMP) support is needed.
        if c1 >= 0x10000:
            continue

        if x[12] != '':
            # Field 12: simple uppercase mapping.
            c2 = parse_unicode_sequence(x[12])
            uc[c1] = c2
            tc[c1] = c2    # titlecase defaults to uppercase, overridden below if necessary
        if x[13] != '':
            # Field 13: simple lowercase mapping.
            c2 = parse_unicode_sequence(x[13])
            lc[c1] = c2
        if x[14] != '':
            # Field 14: simple titlecase mapping.
            c2 = parse_unicode_sequence(x[14])
            tc[c1] = c2

    return uc, lc, tc
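
# For reference, UnicodeData.txt lines are semicolon-separated with 15
# fields; fields 12-14 hold the simple uppercase/lowercase/titlecase
# mappings used above.  An illustrative line (U+0061):
#
#   0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041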

def update_special_casings(uc, lc, tc, special_casing):
    """Update case conversion tables with special case conversion rules."""

    for x in special_casing.data:
        c1 = long(x[0], 16)

        if x[4] != '':
            # Conditional (context/locale sensitive) rules are skipped;
            # they are handled directly in C code.
            continue

        lower = parse_unicode_sequence(x[1])
        title = parse_unicode_sequence(x[2])
        upper = parse_unicode_sequence(x[3])

        if len(lower) > 1:
            lc[c1] = lower
        if len(upper) > 1:
            uc[c1] = upper
        if len(title) > 1:
            tc[c1] = title

        print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))
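
# For reference, an unconditional SpecialCasing.txt line looks like this
# (fields: code; lower; title; upper; optional condition):
#
#   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
#
# Here U+00DF lowercases to itself and uppercases to the two-codepoint
# sequence U+0053 U+0053 ('SS').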

def remove_ascii_part(convmap):
    """Remove ASCII case conversion parts (handled by C fast path)."""

    for i in xrange(128):
        if i in convmap:
            del convmap[i]

def scan_range_with_skip(convmap, start_idx, skip):
    """Scan for a range of continuous case conversion with a certain 'skip'."""

    conv_i = start_idx
    if conv_i not in convmap:
        return None, None, None
    elif len(convmap[conv_i]) > 1:
        return None, None, None
    else:
        conv_o = ord(convmap[conv_i])

    start_i = conv_i
    start_o = conv_o

    while True:
        new_i = conv_i + skip
        new_o = conv_o + skip

        if new_i not in convmap:
            break
        if len(convmap[new_i]) > 1:
            break
        if ord(convmap[new_i]) != new_o:
            break

        conv_i = new_i
        conv_o = new_o

    # [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char.
    count = (conv_i - start_i) / skip + 1
    if count <= 1:
        return None, None, None

    # We have an acceptable range, remove it from the convmap here.
    for i in xrange(start_i, conv_i + skip, skip):
        del convmap[i]

    return start_i, start_o, count

def find_first_range_with_skip(convmap, skip):
    """Find first range with a certain 'skip' value."""

    for i in xrange(65536):
        start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
        if start_i is None:
            continue
        return start_i, start_o, count

    return None, None, None

def generate_caseconv_tables(convmap):
    """Generate bit-packed case conversion table for a given conversion map."""

    # The bitstream encoding is based on manual inspection of whatever
    # regularity the Unicode case conversion rules have.
    #
    # Start with a full description of case conversions which does not
    # cover all codepoints; unmapped codepoints convert to themselves.
    # Scan for range-to-range mappings with a range of skips starting from 1.
    # Whenever a valid range is found, remove it from the map.  Finally,
    # output the remaining case conversions (1:1 and 1:n) on a per codepoint
    # basis.
    #
    # This is very slow because we always scan from scratch, but it's the
    # most reliable and simple way to scan.
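    #
    # For example, in Latin Extended-A uppercase and lowercase letters
    # alternate, so in the uppercase map U+0101 -> U+0100, U+0103 -> U+0102,
    # etc.; the scan captures such runs as a single skip=2 range.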

    print('generate caseconv tables')

    ranges = []        # range mappings (2 or more consecutive mappings with a certain skip)
    singles = []       # 1:1 character mappings
    multis = []        # 1:n character mappings

    # Ranges with skips.

    for skip in xrange(1, 6 + 1):    # skips 1...6 are useful
        while True:
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
            if start_i is None:
                break
            print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
            ranges.append([start_i, start_o, count, skip])

    # 1:1 conversions.

    k = sorted(convmap.keys())
    for i in k:
        if len(convmap[i]) > 1:
            continue
        singles.append([i, ord(convmap[i])])    # codepoint, codepoint
        del convmap[i]

    # There are many mappings to 2-char sequences with the latter char being
    # U+0399.  These could be handled as a special case, but we don't do
    # that right now.
    #
    # [8064L, u'\u1f08\u0399']
    # [8065L, u'\u1f09\u0399']
    # [8066L, u'\u1f0a\u0399']
    # [8067L, u'\u1f0b\u0399']
    # [8068L, u'\u1f0c\u0399']
    # [8069L, u'\u1f0d\u0399']
    # [8070L, u'\u1f0e\u0399']
    # [8071L, u'\u1f0f\u0399']
    # ...
    #
    # tmp = {}
    # k = convmap.keys()
    # k.sort()
    # for i in k:
    #    if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
    #        tmp[i] = convmap[i][0]
    #        del convmap[i]
    # print(repr(tmp))
    #
    # skip = 1
    # while True:
    #    start_i, start_o, count = find_first_range_with_skip(tmp, skip)
    #    if start_i is None:
    #        break
    #    print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count))
    # print(len(tmp.keys()))
    # print(repr(tmp))
    # XXX: need to put 12 remaining mappings back to convmap

    # 1:n conversions.

    k = sorted(convmap.keys())
    for i in k:
        multis.append([i, convmap[i]])        # codepoint, string
        del convmap[i]

    for t in singles:
        print('- singles: ' + repr(t))

    for t in multis:
        print('- multis: ' + repr(t))

    print('- range mappings: %d' % len(ranges))
    print('- single character mappings: %d' % len(singles))
    print('- complex mappings (1:n): %d' % len(multis))
    print('- remaining (should be zero): %d' % len(convmap.keys()))

    # XXX: opportunities for diff encoding skip=3 ranges?
    prev = None
    for t in ranges:
        # range: [start_i, start_o, count, skip]
        if t[3] != 3:
            continue
        if prev is not None:
            print('- %d %d' % (t[0] - prev[0], t[1] - prev[1]))
        else:
            print('- start: %d %d' % (t[0], t[1]))
        prev = t

    # Bit-packed encoding.

    be = dukutil.BitEncoder()

    for curr_skip in xrange(1, 6 + 1):    # 1...6
        count = 0
        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            count += 1
        be.bits(count, 6)
        print('- encode: skip=%d, count=%d' % (curr_skip, count))

        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            be.bits(start_i, 16)
            be.bits(start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)    # maximum count value marks the end of skip sections

    count = len(singles)
    be.bits(count, 7)
    for t in singles:
        cp_i, cp_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    count = len(multis)
    be.bits(count, 7)
    for t in multis:
        cp_i, str_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for i in xrange(len(str_o)):
            be.bits(ord(str_o[i]), 16)

    return be.getBytes(), be.getNumBits()
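
# Decoding sketch for the bitstream above (commented out; the real
# consumer is C code).  'getbits' is a hypothetical bit reader assumed
# to use the same bit order as dukutil.BitEncoder.
#
# def decode_caseconv(getbits):
#     skip = 1
#     while True:
#         count = getbits(6)
#         if count == 0x3f:             # end-of-skips marker
#             break
#         for _ in xrange(count):       # ranges for this skip
#             start_i = getbits(16)
#             start_o = getbits(16)
#             r_count = getbits(7)
#             # start_i + k*skip maps to start_o + k*skip, k in 0...r_count-1
#         skip += 1
#     for _ in xrange(getbits(7)):      # 1:1 mappings
#         cp_i, cp_o = getbits(16), getbits(16)
#     for _ in xrange(getbits(7)):      # 1:n mappings
#         cp_i = getbits(16)
#         seq = [ getbits(16) for _ in xrange(getbits(2)) ]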

def generate_regexp_canonicalize_tables(convmap):
    """Generate tables for case insensitive RegExp normalization."""

    # Generate a direct codepoint lookup for canonicalizing the BMP range.

    def generate_canontab():
        res = []
        highest_nonid = -1

        for cp in xrange(65536):
            res_cp = cp  # default: codepoint canonicalizes to itself
            if cp in convmap:
                tmp = convmap[cp]
                if len(tmp) == 1:
                    # Mappings to multiple codepoints are ignored.
                    res_cp = ord(tmp[0])
            if cp >= 0x80 and res_cp < 0x80:
                res_cp = cp  # If non-ASCII maps to ASCII, ignore the mapping.
            if cp != res_cp:
                highest_nonid = cp
            res.append(res_cp)

        # At the moment this is 65370, which means there's very little
        # gain in assuming a 1:1 mapping above a certain BMP codepoint
        # (though we do assume a 1:1 mapping for codepoints above the BMP).
        print('- highest non-identity mapping: %d' % highest_nonid)

        return res

    print('generate canontab')
    canontab = generate_canontab()
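
    # For example, canontab[0x61] == 0x41 ('a' canonicalizes to 'A'), but
    # canontab[0x17F] == 0x17F: U+017F (long s) uppercases to ASCII U+0053,
    # which the non-ASCII-to-ASCII guard above rejects.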

    # Figure out which BMP values are never the result of canonicalization.
    # Such codepoints are "don't care" in the sense that they are never
    # matched against at runtime: ranges are canonicalized at compile time,
    # and the codepoint being matched is also canonicalized at run time.
    # (Currently unused.)

    def generate_dontcare():
        res = [ True ] * 65536
        for cp in canontab:
            res[cp] = False
        res_count = 0
        for x in res:
            if x:
                res_count += 1
        print('- %d dontcare codepoints' % res_count)
        return res

    print('generate canon dontcare')
    dontcare = generate_dontcare()

    # Generate maximal continuous ranges for canonicalization.  A continuous
    # range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
    # for fixed IN, OUT, and i in 0...N-1.  There are unfortunately >1000
    # of these ranges, mostly because there are a lot of individual exceptions.
    # (Currently unused.)

    canon_ranges = []
    for cp in xrange(65536):
        canon_ranges.append([ cp, canontab[cp], 1 ])  # 1-codepoint ranges at first

    def merge_compatible_nogap(rng1, rng2):
        # Merge adjacent ranges if continuity allows.
        if rng1[0] + rng1[2] == rng2[0] and \
           rng1[1] + rng1[2] == rng2[1]:
            return [ rng1[0], rng1[1], rng1[2] + rng2[2] ]
        return None

    def merge_check_nogap():
        len_start = len(canon_ranges)
        for i in xrange(len(canon_ranges) - 1):
            j = i + 1
            rng1 = canon_ranges[i]
            rng2 = canon_ranges[j]
            if rng1 is None or rng2 is None:
                continue
            merged = merge_compatible_nogap(rng1, rng2)
            if merged is not None:
                canon_ranges[j] = None
                canon_ranges[i] = merged
        filtered = []
        for x in canon_ranges:
            if x is not None:
                filtered.append(x)
        len_end = len(filtered)
        if len_end < len_start:
            return filtered
        return None
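
    # For example, identity-mapped neighbors merge: [ 0x20, 0x20, 1 ] and
    # [ 0x21, 0x21, 1 ] satisfy the continuity check and merge into
    # [ 0x20, 0x20, 2 ], which can then absorb [ 0x22, 0x22, 1 ], etc.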

    print('generate canon_ranges')
    while True:
        # Starting from individual ranges of 1 codepoint, merge adjacent
        # ranges until no more ranges can be merged.
        t = merge_check_nogap()
        if t is None:
            break
        canon_ranges = t
    print('- %d ranges' % len(canon_ranges))
    #for rng in canon_ranges:
    #    print('canon_ranges:')
    #    print(repr(rng))

    # Generate true/false ranges for BMP codepoints where:
    # - A codepoint is flagged true if continuity is broken at that point, so
    #   an explicit codepoint canonicalization is needed at runtime.
    # - A codepoint is flagged false if case conversion is continuous from the
    #   previous codepoint, i.e. out_curr = out_prev + 1.
    #
    # The result is a lot of small ranges due to a lot of small 'false' ranges.
    # Reduce the range set by checking if adjacent 'true' ranges have at most
    # false_limit 'false' entries between them.  If so, force the 'false'
    # entries to 'true' (safe but results in an unnecessary runtime codepoint
    # lookup) and merge the three ranges into a larger 'true' range.
    #
    # (Currently unused.)

    def generate_needcheck_straight():
        # First create a straight true/false bitmap for the BMP.
        res = [ True ] * 65536
        assert(canontab[0] == 0)  # can start from in == out == 0
        prev_in = -1
        prev_out = -1
        for i in xrange(65536):
            curr_in = i
            curr_out = canontab[i]
            if prev_in + 1 == curr_in and prev_out + 1 == curr_out:
                res[i] = False
            prev_in = curr_in
            prev_out = curr_out
        return res

    def generate_needcheck_ranges(data):
        # Generate maximal accurate ranges.
        prev = None
        count = 0
        ranges = []
        for i in data:
            if prev is None or prev != i:
                if prev is not None:
                    ranges.append([ prev, count ])
                prev = i
                count = 1
            else:
                count += 1
        if prev is not None:
            ranges.append([ prev, count ])
        return ranges
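
    # For example, generate_needcheck_ranges([ True, True, False, False,
    # False, True ]) yields [ [True, 2], [False, 3], [True, 1] ].
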
    def fillin_needcheck_ranges(data, false_limit):
        # Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
        # safe (leads to an unnecessary runtime check) but reduces
        # range data size considerably.
        res = []
        for r in data:
            res.append([ r[0], r[1] ])
        while True:
            found = False
            for i in xrange(len(res) - 2):
                r1 = res[i]
                r2 = res[i + 1]
                r3 = res[i + 2]
                if r1[0] == True and r2[0] == False and r3[0] == True and \
                   r2[1] <= false_limit:
                    #print('fillin %d falses' % r2[1])
                    res.pop(i + 2)
                    res.pop(i + 1)
                    res[i] = [ True, r1[1] + r2[1] + r3[1] ]
                    found = True
                    break
            if not found:
                break
        return res
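
    # For example, with false_limit=11 the range list
    # [ [True, 3], [False, 5], [True, 2] ] collapses into [ [True, 10] ]:
    # the five 'false' entries are forced to 'true' and the three ranges
    # merge into one.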

    print('generate needcheck straight')
    needcheck = generate_needcheck_straight()

    print('generate needcheck without false fillins')
    needcheck_ranges1 = generate_needcheck_ranges(needcheck)
    print('- %d ranges' % len(needcheck_ranges1))
    #print(needcheck_ranges1)

    print('generate needcheck with false fillins')
    needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
    print('- %d ranges' % len(needcheck_ranges2))
    #print(needcheck_ranges2)

    # Generate a bitmap for the BMP, divided into N-codepoint blocks, with
    # each bit indicating: "entire codepoint block canonicalizes continuously,
    # and the block is continuous with the previous and next block".  A 'true'
    # entry allows runtime code to just skip the block, advancing 'in' and
    # 'out' by the block size, with no codepoint conversion.  The block size
    # should be large enough to produce a relatively small lookup table, but
    # small enough to reduce codepoint conversions to a manageable number
    # because the conversions are (currently) quite slow.  This matters
    # especially for case-insensitive RegExps; without any optimization,
    # /[\u0000-\uffff]/i requires 65536 case conversions for runtime
    # normalization.
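    #
    # A sketch of the intended runtime lookup (illustrative, not the
    # actual C code; bit packing matches convert_to_bitmap() below):
    #
    #   block = cp >> block_shift
    #   if bitmap[block >> 3] & (1 << (block & 0x07)):
    #       cp += block_size    # whole block is continuous, skip it
    #   else:
    #       # canonicalize codepoints in the block one at a time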

    block_shift = 5
    block_size = 1 << block_shift
    block_mask = block_size - 1
    num_blocks = 65536 / block_size

    def generate_block_bits(check_continuity):
        res = [ True ] * num_blocks
        for i in xrange(num_blocks):
            base_in = i * block_size
            base_out = canontab[base_in]
            if check_continuity:
                lower = -1   # [-1,block_size]
                upper = block_size + 1
            else:
                lower = 0    # [0,block_size-1]
                upper = block_size
            for j in xrange(lower, upper):
                cp = base_in + j
                if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j:
                    res[i] = False
                    break
        return res

    def dump_block_bitmap(bits):
        tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits ])
        tmp = re.sub(r'.{64}', lambda x: x.group(0) + '\n', tmp)
        blocks_true = tmp.count('x')
        blocks_false = tmp.count('.')
        print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
        sys.stdout.write(tmp)
        #print(bits)

    def dump_test_lookup(bits):
        sys.stdout.write('duk_uint8_t test[] = {')
        for b in bits:
            if b:
                sys.stdout.write('1,')
            else:
                sys.stdout.write('0,')
        sys.stdout.write('};\n')

    def convert_to_bitmap(bits):
        # C code looks up bits as:
        #   index = codepoint >> N
        #   bitnum = codepoint & mask
        #   bitmask = 1 << bitnum
        # So block 0 is mask 0x01 of the first byte, block 1 is mask 0x02 of
        # the first byte, etc.
        res = []
        curr = 0
        mask = 0x01
        for b in bits:
            if b:
                curr += mask
            mask = mask * 2
            if mask == 0x100:
                res.append(curr)
                curr = 0
                mask = 0x01
        assert(mask == 0x01)  # no leftover
        return res
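
    # For example, convert_to_bitmap([ True ] + [ False ] * 7) == [ 0x01 ]
    # and convert_to_bitmap([ False ] * 7 + [ True ]) == [ 0x80 ].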

    print('generate canon block bitmap without continuity')
    block_bits1 = generate_block_bits(False)
    dump_block_bitmap(block_bits1)
    dump_test_lookup(block_bits1)

    print('generate canon block bitmap with continuity')
    block_bits2 = generate_block_bits(True)
    dump_block_bitmap(block_bits2)
    dump_test_lookup(block_bits2)

    print('generate final canon bitmap')
    block_bitmap = convert_to_bitmap(block_bits2)
    print('- %d bytes' % len(block_bitmap))
    print('- ' + repr(block_bitmap))
    canon_bitmap = {
        'data': block_bitmap,
        'block_size': block_size,
        'block_shift': block_shift,
        'block_mask': block_mask
    }

    # This is useful for figuring out corner case tests.
    print('canon blocks which are different with and without continuity check')
    for i in xrange(num_blocks):
        if block_bits1[i] != block_bits2[i]:
            print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))

    return canontab, canon_bitmap

def clonedict(x):
    """Shallow clone of input dict."""
    res = {}
    for k in x.keys():
        res[k] = x[k]
    return res

def main():
    parser = optparse.OptionParser()
    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
    parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
    (opts, args) = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded.

        # ASCII is handled by the C fast path so it is not needed here.
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, uc_nbits = generate_caseconv_tables(t)

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, lc_nbits = generate_caseconv_tables(t)

        # Generate C source and header files.
        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_lookup':
        # Direct canonicalization lookup for case insensitive regexps; includes the ASCII part.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_bitmap':
        # N-codepoint block bitmap for skipping continuous codepoint blocks
        # quickly.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data'])))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    else:
        raise Exception('invalid command: %r' % opts.command)

if __name__ == '__main__':
    main()