#!/usr/bin/env python
"""Generate Rust byte-frequency tables from sample text files.

Usage: pass one or more UTF-8 encoded files as command-line arguments. The
script counts how often each byte value occurs and prints two Rust constants
on stdout:

* ``COMMON_INPUTS``: byte value -> rank (0 is the most frequent byte).
* ``COMMON_INPUTS_INV``: rank -> byte value (the inverse permutation).
"""

from __future__ import absolute_import, division, print_function
import codecs
from operator import itemgetter
import sys


def count_byte_frequencies(paths):
    """Return a 256-entry list mapping each byte value to its frequency.

    Every file in *paths* is decoded as UTF-8 and read line by line. Leading
    and trailing whitespace of each line (including the line terminator) is
    stripped before the line is re-encoded and counted, so those bytes do
    not contribute to the totals.
    """
    freqs = [0] * 256  # byte |--> frequency
    for fpath in paths:
        with codecs.open(fpath, 'r', 'utf-8') as fin:
            for line in fin:
                # bytearray yields ints on both Python 2 and Python 3;
                # iterating a Python 2 `str` would yield 1-char strings,
                # which cannot be used as list indices.
                for byte in bytearray(line.strip().encode('utf-8')):
                    freqs[byte] += 1
    return freqs


def rank_by_frequency(freqs):
    """Return a list mapping each byte value to its descending-frequency rank.

    The sort is stable, so bytes with equal frequency keep ascending
    byte-value order among themselves.
    """
    orders = [0] * len(freqs)  # byte |--> sort index, descending
    by_freq = sorted(enumerate(freqs), key=itemgetter(1), reverse=True)
    for sort_idx, (byte, _) in enumerate(by_freq):
        orders[byte] = sort_idx
    return orders


def byte_repr(byte):
    """Return an ASCII-only quoted representation of *byte* for comments.

    Bytes above 127 are always shown as a ``\\xNN`` escape. Passing them
    through ``chr`` would render them as Latin-1 characters on Python 3,
    making the generated file depend on the interpreter version and on the
    encoding of stdout.
    """
    if byte > 127:
        return "'\\x%02x'" % byte
    return repr(chr(byte))


def rust_byte_literal(byte):
    """Return the Rust ``u8`` byte literal (``b'..'``) for *byte*."""
    if byte > 127:
        return "b'\\x%x'" % byte
    if byte == 0x27:
        # repr("'") is double-quoted, which would produce b"'": a Rust
        # byte *string*, not a u8. Emit the escaped byte literal instead.
        return "b'\\''"
    return 'b%r' % chr(byte)


def render_rust_tables(orders):
    """Return the lines of Rust source for both lookup tables.

    *orders* must be a permutation of ``range(256)`` mapping byte -> rank.
    """
    # Invert the permutation in one pass instead of calling
    # orders.index() once per rank.
    inverse = [0] * 256  # sort index |--> byte
    for byte, sort_idx in enumerate(orders):
        inverse[sort_idx] = byte

    # NOTE(review): the one-space indent in the emitted lines is copied
    # from the source as received; confirm the intended width.
    olines = ['pub const COMMON_INPUTS: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %s' % (orders[byte], byte_repr(byte)))
    olines.append('];')
    olines.append('')
    olines.append('pub const COMMON_INPUTS_INV: [u8; 256] = [')
    for byte in inverse:
        olines.append(' %s,' % rust_byte_literal(byte))
    olines.append('];')
    return olines


def main(argv=None):
    """Count bytes in the files named in *argv[1:]* and print the Rust tables.

    *argv* defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    freqs = count_byte_frequencies(argv[1:])
    orders = rank_by_frequency(freqs)
    print('\n'.join(render_rust_tables(orders)))


if __name__ == '__main__':
    main()