#!/usr/bin/env python2
#
# Extract rules for Unicode case conversion, specifically the behavior
# required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19.  The
# bitstream encoded rules are used for the slow path at run time, so
# compactness is favored over speed.
#
# There is no support for context or locale sensitive rules, as they
# are handled directly in C code before consulting tables generated
# here.  ECMAScript requires case conversion both with and without
# locale/language specific rules (e.g. String.prototype.toLowerCase()
# and String.prototype.toLocaleLowerCase()), so they are best handled
# in C anyway.
#
# Case conversion rules for ASCII are also excluded as they are handled
# by the C fast path.  Rules for non-BMP characters (codepoints above
# U+FFFF) are omitted as they're not required for standard ECMAScript.
#

import os
import sys
import re
import math
import optparse

import dukutil

class UnicodeData:
    """Read UnicodeData.txt into an internal representation."""

    def __init__(self, filename):
        self.data = self.read_unicode_data(filename)
        print('read %d unicode data entries' % len(self.data))

    def read_unicode_data(self, filename):
        """Parse UnicodeData.txt lines into 15-field lists, sorted by codepoint.

        Each entry is the semicolon-separated field list of one line;
        field 0 is the codepoint as a hex string.
        """
        res = []
        f = open(filename, 'rb')
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            if len(parts) != 15:
                raise Exception('invalid unicode data line')
            res.append(parts)
        f.close()

        # Sort based on Unicode codepoint (field 0, hex string).  A key
        # based sort is equivalent to the old cmp based one but avoids
        # the deprecated cmp= argument.
        res.sort(key=lambda parts: long(parts[0], 16))
        return res

class SpecialCasing:
    """Read SpecialCasing.txt into an internal representation."""

    def __init__(self, filename):
        self.data = self.read_special_casing_data(filename)
        print('read %d special casing entries' % len(self.data))

    def read_special_casing_data(self, filename):
        """Parse SpecialCasing.txt lines into stripped field lists.

        Trailing '#' comments are removed; every entry is padded to at
        least 6 fields so that the condition field (index 4) always exists.
        """
        res = []
        f = open(filename, 'rb')
        for line in f:
            # Strip trailing comment, if any.
            try:
                idx = line.index('#')
                line = line[:idx]
            except ValueError:
                pass
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            parts = [i.strip() for i in parts]
            while len(parts) < 6:
                parts.append('')
            res.append(parts)
        f.close()
        return res

def parse_unicode_sequence(x):
    """Parse a Unicode sequence like 'ABCD 1234' into a unicode string."""
    res = ''
    for i in x.split(' '):
        i = i.strip()
        if i == '':
            continue
        res += unichr(long(i, 16))
    return res

def get_base_conversion_maps(unicode_data):
    """Create case conversion tables without handling special casing yet.

    Returns (uc, lc, tc): dicts mapping a BMP codepoint (number) to its
    uppercase / lowercase / titlecase result (unicode string).
    """
    uc = {}  # uppercase, codepoint (number) -> string
    lc = {}  # lowercase
    tc = {}  # titlecase

    for x in unicode_data.data:
        c1 = long(x[0], 16)

        # Just 16-bit support needed.
        if c1 >= 0x10000:
            continue

        if x[12] != '':
            # field 12: simple uppercase mapping
            c2 = parse_unicode_sequence(x[12])
            uc[c1] = c2
            tc[c1] = c2  # titlecase default == uppercase, overridden below if necessary
        if x[13] != '':
            # field 13: simple lowercase mapping
            c2 = parse_unicode_sequence(x[13])
            lc[c1] = c2
        if x[14] != '':
            # field 14: simple titlecase mapping
            c2 = parse_unicode_sequence(x[14])
            tc[c1] = c2

    return uc, lc, tc

def update_special_casings(uc, lc, tc, special_casing):
    """Update case conversion tables with special case conversion rules.

    Only unconditional 1:n mappings are applied; conditional (locale or
    context sensitive) rules are handled directly in C code, and 1:1
    mappings are already covered by UnicodeData.txt.
    """
    for x in special_casing.data:
        c1 = long(x[0], 16)

        if x[4] != '':
            # Field 4 non-empty: conditional rule, skip.
            continue

        lower = parse_unicode_sequence(x[1])
        title = parse_unicode_sequence(x[2])
        upper = parse_unicode_sequence(x[3])

        if len(lower) > 1:
            lc[c1] = lower
        if len(upper) > 1:
            uc[c1] = upper
        if len(title) > 1:
            tc[c1] = title

        print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))

def remove_ascii_part(convmap):
    """Remove ASCII case conversion parts (handled by C fast path)."""

    for i in xrange(128):
        if i in convmap:
            del convmap[i]

def scan_range_with_skip(convmap, start_idx, skip):
    """Scan for a range of continuous case conversion with a certain 'skip'.

    Starting at codepoint start_idx, follow 1:1 mappings at intervals of
    'skip' for as long as the output also advances by 'skip'.  Returns
    (start_in, start_out, count), or (None, None, None) when no range of
    at least 2 mappings begins at start_idx.  Found mappings are removed
    from convmap as a side effect.
    """
    conv_i = start_idx
    if conv_i not in convmap:
        return None, None, None
    elif len(convmap[conv_i]) > 1:
        # 1:n mappings never participate in ranges.
        return None, None, None
    else:
        conv_o = ord(convmap[conv_i])

    start_i = conv_i
    start_o = conv_o

    while True:
        new_i = conv_i + skip
        new_o = conv_o + skip

        if new_i not in convmap:
            break
        if len(convmap[new_i]) > 1:
            break
        if ord(convmap[new_i]) != new_o:
            break

        conv_i = new_i
        conv_o = new_o

    # [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char.
    count = (conv_i - start_i) // skip + 1
    if count <= 1:
        return None, None, None

    # We have an acceptable range, remove them from the convmap here.
    for i in xrange(start_i, conv_i + skip, skip):
        del convmap[i]

    return start_i, start_o, count

def find_first_range_with_skip(convmap, skip):
    """Find first range with a certain 'skip' value, removing it from convmap."""

    for i in xrange(65536):
        start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
        if start_i is None:
            continue
        return start_i, start_o, count

    return None, None, None

def generate_caseconv_tables(convmap):
    """Generate bit-packed case conversion table for a given conversion map.

    Returns (bytes, nbits) of the bitstream encoding.  convmap is consumed
    (emptied) in the process.
    """
    # The bitstream encoding is based on manual inspection for whatever
    # regularity the Unicode case conversion rules have.
    #
    # Start with a full description of case conversions which does not
    # cover all codepoints; unmapped codepoints convert to themselves.
    # Scan for range-to-range mappings with a range of skips starting from 1.
    # Whenever a valid range is found, remove it from the map.  Finally,
    # output the remaining case conversions (1:1 and 1:n) on a per codepoint
    # basis.
    #
    # This is very slow because we always scan from scratch, but it's the
    # most reliable and simple way to scan.

    print('generate caseconv tables')

    ranges = []   # range mappings (2 or more consecutive mappings with a certain skip)
    singles = []  # 1:1 character mappings
    multis = []   # 1:n character mappings

    # Ranges with skips

    for skip in xrange(1, 6 + 1):  # skips 1...6 are useful
        while True:
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
            if start_i is None:
                break
            print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
            ranges.append([start_i, start_o, count, skip])

    # 1:1 conversions

    for i in sorted(convmap.keys()):
        if len(convmap[i]) > 1:
            continue
        singles.append([i, ord(convmap[i])])  # codepoint, codepoint
        del convmap[i]

    # There are many mappings to 2-char sequences with latter char being U+0399.
    # These could be handled as a special case, but we don't do that right now.
    #
    # [8064L, u'\u1f08\u0399']
    # [8065L, u'\u1f09\u0399']
    # [8066L, u'\u1f0a\u0399']
    # [8067L, u'\u1f0b\u0399']
    # [8068L, u'\u1f0c\u0399']
    # [8069L, u'\u1f0d\u0399']
    # [8070L, u'\u1f0e\u0399']
    # [8071L, u'\u1f0f\u0399']
    # ...
    #
    # tmp = {}
    # k = convmap.keys()
    # k.sort()
    # for i in k:
    #     if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
    #         tmp[i] = convmap[i][0]
    #         del convmap[i]
    # print(repr(tmp))
    #
    # skip = 1
    # while True:
    #     start_i, start_o, count = find_first_range_with_skip(tmp, skip)
    #     if start_i is None:
    #         break
    #     print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count))
    # print(len(tmp.keys()))
    # print(repr(tmp))
    # XXX: need to put 12 remaining mappings back to convmap

    # 1:n conversions

    for i in sorted(convmap.keys()):
        multis.append([i, convmap[i]])  # codepoint, string
        del convmap[i]

    for t in singles:
        print('- singles: ' + repr(t))

    for t in multis:
        print('- multis: ' + repr(t))

    print('- range mappings: %d' % len(ranges))
    print('- single character mappings: %d' % len(singles))
    print('- complex mappings (1:n): %d' % len(multis))
    print('- remaining (should be zero): %d' % len(convmap.keys()))

    # XXX: opportunities for diff encoding skip=3 ranges?
    prev = None
    for t in ranges:
        # range: [start_i, start_o, count, skip]
        if t[3] != 3:
            continue
        if prev is not None:
            print('- %d %d' % (t[0] - prev[0], t[1] - prev[1]))
        else:
            print('- start: %d %d' % (t[0], t[1]))
        prev = t

    # Bit packed encoding.  Layout:
    #   per skip 1..6: 6-bit range count, then per range 16-bit start_in,
    #   16-bit start_out, 7-bit count; terminated by 6-bit 0x3f marker;
    #   then 7-bit singles count + 16+16 bit pairs; then 7-bit multis
    #   count + 16-bit codepoint, 2-bit length, 16-bit chars.

    be = dukutil.BitEncoder()

    for curr_skip in xrange(1, 7):  # 1...6
        count = 0
        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            count += 1
        be.bits(count, 6)
        print('- encode: skip=%d, count=%d' % (curr_skip, count))

        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            be.bits(start_i, 16)
            be.bits(start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)  # maximum count value = end of skips

    count = len(singles)
    be.bits(count, 7)
    for t in singles:
        cp_i, cp_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    count = len(multis)
    be.bits(count, 7)
    for t in multis:
        cp_i, str_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for i in xrange(len(str_o)):
            be.bits(ord(str_o[i]), 16)

    return be.getBytes(), be.getNumBits()

def generate_regexp_canonicalize_tables(convmap):
    """Generate tables for case insensitive RegExp normalization.

    Returns (canontab, canon_bitmap): a direct 65536-entry codepoint
    lookup and a per-block continuity bitmap (dict with 'data',
    'block_size', 'block_shift', 'block_mask').
    """
    # Generate a direct codepoint lookup for canonicalizing BMP range.

    def generate_canontab():
        res = []
        highest_nonid = -1

        for cp in xrange(65536):
            res_cp = cp  # default to as is
            if cp in convmap:
                tmp = convmap[cp]
                if len(tmp) == 1:
                    # If multiple codepoints from input, ignore.
                    res_cp = ord(tmp[0])
            if cp >= 0x80 and res_cp < 0x80:
                res_cp = cp  # If non-ASCII mapped to ASCII, ignore.
            if cp != res_cp:
                highest_nonid = cp
            res.append(res_cp)

        # At the moment this is 65370, which means there's very little
        # gain in assuming 1:1 mapping above a certain BMP codepoint
        # (though we do assume 1:1 mapping for above BMP codepoints).
        print('- highest non-identity mapping: %d' % highest_nonid)

        return res

    print('generate canontab')
    canontab = generate_canontab()

    # Figure out which BMP values are never the result of canonicalization.
    # Such codepoints are "don't care" in the sense that they are never
    # matched against at runtime: ranges are canonicalized at compile time,
    # and codepoint being matched is also canonicalized at run time.
    # (Currently unused.)

    def generate_dontcare():
        res = [ True ] * 65536
        for cp in canontab:
            res[cp] = False
        res_count = 0
        for x in res:
            if x:
                res_count += 1
        print('- %d dontcare codepoints' % res_count)
        return res

    print('generate canon dontcare')
    dontcare = generate_dontcare()

    # Generate maximal continuous ranges for canonicalization.  A continuous
    # range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
    # for fixed IN, OUT, and i in 0...N-1.  There are unfortunately >1000
    # of these ranges, mostly because there are a lot of individual exceptions.
    # (Currently unused.)

    canon_ranges = []
    for cp in xrange(65536):
        canon_ranges.append([ cp, canontab[cp], 1 ])  # 1 codepoint ranges at first
    def merge_compatible_nogap(rng1, rng2):
        # Merge adjacent ranges if continuity allows.
        if rng1[0] + rng1[2] == rng2[0] and \
           rng1[1] + rng1[2] == rng2[1]:
            return [ rng1[0], rng1[1], rng1[2] + rng2[2] ]
        return None
    def merge_check_nogap():
        # One merge pass; returns the filtered list if anything was
        # merged, or None when a fixpoint has been reached.
        len_start = len(canon_ranges)
        for i in xrange(len(canon_ranges) - 1):
            j = i + 1
            rng1 = canon_ranges[i]
            rng2 = canon_ranges[j]
            if rng1 is None or rng2 is None: continue
            merged = merge_compatible_nogap(rng1, rng2)
            if merged is not None:
                canon_ranges[j] = None
                canon_ranges[i] = merged
        filtered = []
        for x in canon_ranges:
            if x is not None:
                filtered.append(x)
        len_end = len(filtered)
        if len_end < len_start:
            return filtered
        return None

    print('generate canon_ranges')
    while True:
        # Starting from individual ranges of 1 codepoint, merge adjacent
        # ranges until no more ranges can be merged.
        t = merge_check_nogap()
        if t is None:
            break
        canon_ranges = t
    print('- %d ranges' % len(canon_ranges))
    #for rng in canon_ranges:
    #    print('canon_ranges:')
    #    print(repr(rng))

    # Generate true/false ranges for BMP codepoints where:
    # - A codepoint is flagged true if continuity is broken at that point, so
    #   an explicit codepoint canonicalization is needed at runtime.
    # - A codepoint is flagged false if case conversion is continuous from the
    #   previous codepoint, i.e. out_curr = out_prev + 1.
    #
    # The result is a lot of small ranges due to a lot of small 'false' ranges.
    # Reduce the range set by checking if adjacent 'true' ranges have at most
    # false_limit 'false' entries between them.  If so, force the 'false'
    # entries to 'true' (safe but results in an unnecessary runtime codepoint
    # lookup) and merge the three ranges into a larger 'true' range.
    #
    # (Currently unused.)

    def generate_needcheck_straight():
        res = [ True ] * 65536
        assert(canontab[0] == 0)  # can start from in == out == 0
        prev_in = -1
        prev_out = -1
        for i in xrange(65536):
            # First create a straight true/false bitmap for BMP.
            curr_in = i
            curr_out = canontab[i]
            if prev_in + 1 == curr_in and prev_out + 1 == curr_out:
                res[i] = False
            prev_in = curr_in
            prev_out = curr_out
        return res
    def generate_needcheck_ranges(data):
        # Generate maximal accurate [value, count] run-length ranges.
        prev = None
        count = 0
        ranges = []
        for i in data:
            if prev is None or prev != i:
                if prev is not None:
                    ranges.append([ prev, count ])
                prev = i
                count = 1
            else:
                count += 1
        if prev is not None:
            ranges.append([ prev, count ])
        return ranges
    def fillin_needcheck_ranges(data, false_limit):
        # Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
        # safe (leads to an unnecessary runtime check) but reduces
        # range data size considerably.
        res = []
        for r in data:
            res.append([ r[0], r[1] ])
        while True:
            found = False
            for i in xrange(len(res) - 2):
                r1 = res[i]
                r2 = res[i + 1]
                r3 = res[i + 2]
                if r1[0] == True and r2[0] == False and r3[0] == True and \
                   r2[1] <= false_limit:
                    #print('fillin %d falses' % r2[1])
                    res.pop(i + 2)
                    res.pop(i + 1)
                    res[i] = [ True, r1[1] + r2[1] + r3[1] ]
                    found = True
                    break
            if not found:
                break
        return res

    print('generate needcheck straight')
    needcheck = generate_needcheck_straight()

    print('generate needcheck without false fillins')
    needcheck_ranges1 = generate_needcheck_ranges(needcheck)
    print('- %d ranges' % len(needcheck_ranges1))
    #print(needcheck_ranges1)

    print('generate needcheck with false fillins')
    needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
    print('- %d ranges' % len(needcheck_ranges2))
    #print(needcheck_ranges2)

    # Generate a bitmap for BMP, divided into N-codepoint blocks, with each
    # bit indicating: "entire codepoint block canonicalizes continuously, and
    # the block is continuous with the previous and next block".  A 'true'
    # entry allows runtime code to just skip the block, advancing 'in' and
    # 'out' by the block size, with no codepoint conversion.  The block size
    # should be large enough to produce a relatively small lookup table, but
    # small enough to reduce codepoint conversions to a manageable number
    # because the conversions are (currently) quite slow.  This matters
    # especially for case-insensitive RegExps; without any optimization,
    # /[\u0000-\uffff]/i requires 65536 case conversions for runtime
    # normalization.

    block_shift = 5
    block_size = 1 << block_shift
    block_mask = block_size - 1
    num_blocks = 65536 // block_size

    def generate_block_bits(check_continuity):
        res = [ True ] * num_blocks
        for i in xrange(num_blocks):
            base_in = i * block_size
            base_out = canontab[base_in]
            if check_continuity:
                lower = -1  # [-1,block_size]: also check edges against neighbors
                upper = block_size + 1
            else:
                lower = 0   # [0,block_size-1]
                upper = block_size
            for j in xrange(lower, upper):
                cp = base_in + j
                if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j:
                    res[i] = False
                    break
        return res

    def dump_block_bitmap(bits):
        tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits])
        tmp = re.sub(r'.{64}', lambda m: m.group(0) + '\n', tmp)
        blocks_true = tmp.count('x')
        blocks_false = tmp.count('.')
        print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
        sys.stdout.write(tmp)
        #print(bits)

    def dump_test_lookup(bits):
        # Debug helper: dump the bitmap as a C array initializer.
        sys.stdout.write('duk_uint8_t test[] = {')
        for b in bits:
            if b:
                sys.stdout.write('1,')
            else:
                sys.stdout.write('0,')
        sys.stdout.write('};\n')

    def convert_to_bitmap(bits):
        # C code looks up bits as:
        #   index = codepoint >> N
        #   bitnum = codepoint & mask
        #   bitmask = 1 << bitnum
        # So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of
        # first byte, etc.
        res = []
        curr = 0
        mask = 0x01
        for b in bits:
            if b:
                curr += mask
            mask = mask * 2
            if mask == 0x100:
                res.append(curr)
                curr = 0
                mask = 0x01
        assert(mask == 0x01)  # no leftover
        return res

    print('generate canon block bitmap without continuity')
    block_bits1 = generate_block_bits(False)
    dump_block_bitmap(block_bits1)
    dump_test_lookup(block_bits1)

    print('generate canon block bitmap with continuity')
    block_bits2 = generate_block_bits(True)
    dump_block_bitmap(block_bits2)
    dump_test_lookup(block_bits2)

    print('generate final canon bitmap')
    block_bitmap = convert_to_bitmap(block_bits2)
    print('- %d bytes' % len(block_bitmap))
    print('- ' + repr(block_bitmap))
    canon_bitmap = {
        'data': block_bitmap,
        'block_size': block_size,
        'block_shift': block_shift,
        'block_mask': block_mask
    }

    # This is useful to figure out corner case test cases.
    print('canon blocks which are different with and without continuity check')
    for i in xrange(num_blocks):
        if block_bits1[i] != block_bits2[i]:
            print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))

    return canontab, canon_bitmap

def clonedict(x):
    """Shallow clone of input dict."""
    res = {}
    for k in x.keys():
        res[k] = x[k]
    return res

def main():
    parser = optparse.OptionParser()
    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
    parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
    (opts, args) = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

        # ASCII is handled with 'fast path' so not needed here.
        # Work on clones because table generation consumes the maps.
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, uc_nbits = generate_caseconv_tables(t)

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, lc_nbits = generate_caseconv_tables(t)

        # Generate C source and header files.
        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_lookup':
        # Direct canonicalization lookup for case insensitive regexps, includes ascii part.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_bitmap':
        # N-codepoint block bitmap for skipping continuous codepoint blocks
        # quickly.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data'])))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    else:
        raise Exception('invalid command: %r' % opts.command)

if __name__ == '__main__':
    main()