1# Copyright 2013-2014 The rust-url developers.
2#
3# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6# option. This file may not be copied, modified, or distributed
7# except according to those terms.
8
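# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# NOTE: the file name argument is currently ignored; the script always reads
# IdnaMappingTable.txt from the current working directory.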
10# You can get the latest idna table from
11# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
12
13import collections
14import itertools
15
# Emit the license header and provenance note that open the generated Rust
# source (everything this script prints goes to stdout).
print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_idna_table.py
''')

# Read the whole mapping table up front so the file handle is closed right
# away instead of staying open for the lifetime of the script. The encoding
# is given explicitly because the data file's comments contain non-ASCII
# characters and the platform default encoding is not UTF-8 everywhere.
# `txt` remains an iterable of lines for the parsing loop below.
with open("IdnaMappingTable.txt", encoding="utf-8") as table_file:
    txt = table_file.readlines()
29
def escape_char(c):
    """Return the Rust Unicode escape for the first character of ``c``.

    The result is a backslash followed by ``u{...}``, where the braces hold
    the code point in lowercase hexadecimal without padding. Any characters
    after the first one are ignored.
    """
    code_point = ord(c[0])
    return "\\u{" + format(code_point, "x") + "}"
32
def char(s):
    """Convert a hexadecimal code point string (e.g. ``"00DF"``) to a character."""
    code_point = int(s, base=16)
    return chr(code_point)
35
# Maps every distinct string seen so far to its (byte offset, UTF-8 byte
# length) pair, in first-seen order.
strtab = collections.OrderedDict()
# Byte offset that the next previously-unseen string will be assigned.
strtab_offset = 0

def strtab_slice(s):
    """Return the ``(byte offset, byte length)`` pair for ``s`` in the string table.

    A string that was seen before gets the pair recorded for it earlier.
    A new string is recorded at the current end of the table, and the running
    offset advances by the string's UTF-8 encoded length.
    """
    global strtab_offset

    entry = strtab.get(s)
    if entry is None:
        byte_len = len(s.encode('utf8'))
        entry = (strtab_offset, byte_len)
        strtab[s] = entry
        strtab_offset += byte_len
    return entry
50
def rust_slice(s):
    """Format a ``(start, length)`` pair as a Rust ``StringTableSlice`` literal.

    The start offset is split into a low byte and a high part. The length
    and the high part are each asserted to be at most 255.
    """
    offset, byte_len = s[0], s[1]
    # Same split as `offset >> 8` / `offset & 0xff`.
    hi_byte, lo_byte = divmod(offset, 0x100)
    assert byte_len <= 255
    assert hi_byte <= 255
    template = "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })"
    return template % (lo_byte, hi_byte, byte_len)
59
# Parsed table rows as (first, last, mapping, unicode_str) tuples, where
# first/last are hexadecimal code point strings, mapping is the CamelCase
# status name and unicode_str is the replacement text or None.
ranges = []

for raw_line in txt:
    # Drop the trailing comment, then ignore lines with nothing left.
    content = raw_line.partition('#')[0]
    if not content.strip():
        continue
    fields = content.split(';')
    code_points = fields[0].strip()
    if code_points == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.
    # A row covers either a single code point or a "first..last" range.
    first, _, last = code_points.partition('..')
    last = last or first
    # Turn e.g. "disallowed_STD3_valid" into "DisallowedStd3Valid".
    mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')

    unicode_str = None
    if len(fields) > 2:
        target = fields[2].strip()
        if target:
            unicode_str = u''.join(char(cp) for cp in target.split(' '))
        elif mapping == "Deviation":
            # A deviation with no target maps to the empty string.
            unicode_str = u''

    if len(fields) > 3:
        # A fourth field is only expected on valid rows and carries the
        # IDNA2008 status; such rows are re-tagged.
        assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
        assert mapping == 'Valid', mapping
        mapping = 'DisallowedIdna2008'

    ranges.append((first, last, mapping, unicode_str))
88
def mergeable_key(r):
    """Return the grouping key deciding whether adjacent ranges can be merged.

    Ranges whose mapping carries no associated data are keyed by the mapping
    name alone, so neighbouring ranges of the same kind compare equal.
    Ranges with associated data are keyed by the whole range tuple, so they
    are not merged with their neighbours.
    """
    kind = r[2]
    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
        return kind
    # These types have associated data, so the full tuple is the key.
    return r
97
# Collapse each run of adjacent ranges that share a mergeable key into one
# range spanning from the first member's start to the last member's end.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    # The merged range below keeps only the first member's trailing fields,
    # so a string on any member would otherwise be silently discarded.
    for member in group:
        assert member[3] is None, member
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
126
def is_single_char_range(r):
    """Return True when the range tuple ``r`` covers exactly one code point."""
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    """Yield lists of ranges, batching consecutive single-character ranges.

    Each run of adjacent single-character ranges is yielded as one list.
    Every range that spans more than one code point is yielded as a list of
    its own. Nothing is yielded for empty input.
    """
    current = []
    for r in ranges:
        # The pending batch can only grow while both its last member and the
        # incoming range are single-character; otherwise flush it first.
        can_extend = is_single_char_range(r) and (
            not current or is_single_char_range(current[-1])
        )
        if current and not can_extend:
            yield current
            current = []
        current.append(r)
    # Flush the final batch, but never emit an empty one.
    if current:
        yield current
150
# Regroup the merged ranges into blocks: each block is a list of ranges.
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))

# Bit OR-ed into a TABLE index when its block contains exactly one range.
SINGLE_MARKER = 1 << 15

# TABLE holds one row per block: the block's first code point and the index
# of the block's first entry in MAPPING_TABLE (plus the marker bit).
print("static TABLE: &[(char, u16)] = &[")

offset = 0
for block in optimized_ranges:
    # The index must stay clear of the marker bit.
    assert offset < SINGLE_MARKER

    entry_count = len(block)
    if entry_count == 1:
        index = offset | SINGLE_MARKER
    else:
        index = offset | 0
    offset += entry_count

    first_code_point = block[0][0]
    start = escape_char(char(first_code_point))
    print("    ('{}', {}),".format(start, index))

print("];\n")

# MAPPING_TABLE holds one entry per range, in block order.
print("static MAPPING_TABLE: &[Mapping] = &[")

for block in optimized_ranges:
    for (_first, _last, mapping, unicode_str) in block:
        entry = mapping
        if unicode_str is not None:
            # Mappings with data reference their text in the string table.
            entry = mapping + rust_slice(strtab_slice(unicode_str))
        print("    %s," % entry)

print("];\n")
180
def escape_str(s):
    """Return a list with the Rust Unicode escape of every character in ``s``."""
    return list(map(escape_char, s))
183
# Emit every string-table entry in insertion order as a single Rust string
# literal: one escaped character per source line, joined by a backslash,
# a newline and two spaces.
string_table_chars = itertools.chain.from_iterable(escape_str(s) for s in strtab)
print("static STRING_TABLE: &str = \"%s\";" % '\\\n  '.join(string_table_chars))
186