1#!/usr/bin/env python
2
3from __future__ import absolute_import, division, print_function
4import codecs
5from operator import itemgetter
6import sys
7
8
def count_byte_freqs(fpaths):
    """Count how often each byte value occurs in the given UTF-8 files.

    Every file is decoded as UTF-8 and read line by line; each line has its
    leading and trailing whitespace stripped and is then re-encoded as UTF-8
    before its bytes are tallied.

    Args:
        fpaths: iterable of file paths to read.

    Returns:
        A list of 256 ints where index ``b`` holds the number of occurrences
        of byte value ``b``.
    """
    freqs = [0] * 256  # byte |--> frequency
    for fpath in fpaths:
        with codecs.open(fpath, 'r', 'utf-8') as fin:
            for line in fin:
                # Iterating a bytearray yields ints on both Python 2 and 3;
                # iterating the encoded string directly yields 1-char
                # strings on Python 2, which cannot index ``freqs``.
                for byte in bytearray(line.strip().encode('utf-8')):
                    freqs[byte] += 1
    return freqs


def rank_bytes(freqs):
    """Rank byte values from most to least frequent.

    Args:
        freqs: sequence mapping byte value -> frequency.

    Returns:
        A pair ``(orders, inverse)``. ``orders[byte]`` is the rank of
        ``byte`` (0 is the most frequent) and ``inverse[rank]`` is the byte
        holding that rank. Bytes with equal frequency keep ascending byte
        order because the sort is stable.
    """
    ranked = sorted(enumerate(freqs), key=itemgetter(1), reverse=True)
    inverse = [byte for byte, _ in ranked]  # sort index |--> byte
    orders = [0] * len(inverse)  # byte |--> sort index, descending
    for sort_idx, byte in enumerate(inverse):
        orders[byte] = sort_idx
    return orders, inverse


def byte_comment(byte):
    """Return the quoted, ASCII-only label used in the generated comments.

    Bytes above 127 are always shown as ``'\\xNN'`` escapes: they are raw
    bytes rather than Latin-1 code points, and escaping keeps the output
    identical across interpreters and independent of the stdout encoding.
    """
    if byte <= 127:
        return repr(chr(byte))
    return "'\\x%x'" % byte


def rust_byte_literal(byte):
    """Return a Rust ``u8`` byte literal (``b'…'``) for ``byte``."""
    if byte == 0x27:
        # repr("'") is "'" wrapped in double quotes, which would yield the
        # Rust byte *string* b"'" instead of a u8; escape the quote instead.
        return "b'\\''"
    if byte <= 127:
        return 'b%r' % chr(byte)
    return "b'\\x%x'" % byte


def render_rust(orders, inverse):
    """Render the Rust source declaring both lookup tables.

    Args:
        orders: byte value -> rank, as returned by ``rank_bytes``.
        inverse: rank -> byte value, as returned by ``rank_bytes``.

    Returns:
        The Rust source text, without a trailing newline.
    """
    olines = ['pub const COMMON_INPUTS: [u8; 256] = [']
    for byte, sort_idx in enumerate(orders):
        olines.append('    %3d, // %s' % (sort_idx, byte_comment(byte)))
    olines.append('];')
    olines.append('')
    olines.append('pub const COMMON_INPUTS_INV: [u8; 256] = [')
    for byte in inverse:
        olines.append('    %s,' % rust_byte_literal(byte))
    olines.append('];')
    return '\n'.join(olines)


if __name__ == '__main__':
    # Get frequency counts of each byte.
    freqs = count_byte_freqs(sys.argv[1:])

    # Create the forward and inverse mappings.
    orders, inverse = rank_bytes(freqs)

    # Now write Rust.
    print(render_rust(orders, inverse))
40