1#!/usr/bin/env python
2
3# This does simple normalized frequency analysis on UTF-8 encoded text. The
4# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is printed to stdout as Rust source code, which
# is meant to be saved as src/freqs.rs.
6#
7# Currently, the frequencies are generated from the following corpuses:
8#
9#   * The CIA world fact book
10#   * The source code of rustc
11#   * Septuaginta
12
13from __future__ import absolute_import, division, print_function
14
15import argparse
16from collections import Counter
17import sys
18
# Header comment emitted at the top of the generated Rust source. The
# backslash after the opening quotes suppresses the leading newline.
preamble = '''\
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''
23
24
def eprint(*args, **kwargs):
    """Print to stderr; accepts the same arguments as ``print``.

    Any ``file`` keyword supplied by the caller is replaced with
    ``sys.stderr``.
    """
    print(*args, **dict(kwargs, file=sys.stderr))
28
29
def main():
    """Rank every byte value by its frequency in the corpus and print Rust.

    Each FILE named on the command line is read as raw bytes. Within a file,
    every byte occurrence contributes ``1 / len(file)``, so each non-empty
    file carries the same total weight regardless of its size. Byte values
    are then ranked from 0 to 255 (a higher rank means a higher relative
    frequency), except that bytes 0xC0-0xFF are always forced to rank 255.

    The table is printed to stdout as a Rust ``BYTE_FREQUENCIES`` constant;
    progress messages go to stderr.
    """
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Seed every byte value so that bytes absent from the corpus still
    # receive a rank.
    freqs = Counter()
    for i in range(256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        # Close each file promptly instead of leaking the handle. bytearray
        # yields integers when iterated on both Python 2 and Python 3,
        # whereas a Python 2 byte string yields 1-character strings that
        # cannot be used to index `rank` below.
        with open(fpath, 'rb') as f:
            corpus.append(bytearray(f.read()))

    eprint('computing byte frequencies')
    for c in corpus:
        if not c:
            # An empty file contributes nothing (and has no valid weight).
            continue
        # Loop-invariant per-occurrence weight; normalizes by file size.
        weight = 1.0 / len(c)
        for byte in c:
            freqs[byte] += weight

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust: one table entry per byte value, annotated with the
    # repr of the corresponding character.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))
71
72
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
75