1# Copyright 2013-2014 The rust-url developers.
2#
3# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6# option. This file may not be copied, modified, or distributed
7# except according to those terms.
8
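# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# Note: the input is always read from "IdnaMappingTable.txt" in the current
# directory; the command-line argument is not consulted by this script.
# You can get the latest IDNA table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt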
12
13from __future__ import print_function
14import collections
15import itertools
16
# Banner written at the top of the generated Rust source: the license
# notice followed by a note that the file is machine-generated.
RUST_FILE_HEADER = '''\
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_idna_table.py
'''
print(RUST_FILE_HEADER)
28
# Read every line of the mapping table up front so the file handle is closed
# as soon as the read finishes; the parsing code only iterates over `txt`
# line by line, which a list of lines supports just as well as a file object.
with open("IdnaMappingTable.txt") as table_file:
    txt = table_file.readlines()
30
def escape_char(c):
    """Return the Rust ``\\u{...}`` escape for the first character of *c*.

    The code point is written in lowercase hexadecimal without padding.
    """
    code_point = ord(c[0])
    return "\\u{%x}" % code_point
33
def char(s):
    """Return the character whose code point is the hexadecimal string *s*.

    ``unichr`` only exists on Python 2; on Python 3 the lookup raises
    NameError and the equivalent built-in ``chr`` is used instead, so the
    helper works on both interpreters.

    Raises ValueError if *s* is not a valid hexadecimal number.
    """
    code_point = int(s, 16)
    try:
        return unichr(code_point)
    except NameError:
        # Python 3: every str is unicode and chr() covers all code points.
        return chr(code_point)
36
# String table state: `strtab` maps each distinct string to the
# (byte offset, UTF-8 byte length) pair it was assigned, in insertion order;
# `strtab_offset` is the offset the next new string will receive.
strtab = collections.OrderedDict()
strtab_offset = 0

def strtab_slice(s):
    """Return the ``(byte_offset, byte_length)`` slice of *s* in the table.

    A string seen before gets the slice it was given the first time.  A new
    string is registered at the current end of the table, which then grows
    by the string's UTF-8 encoded length.
    """
    global strtab_offset

    try:
        return strtab[s]
    except KeyError:
        pass

    encoded_size = len(s.encode('utf8'))
    entry = (strtab_offset, encoded_size)
    strtab[s] = entry
    strtab_offset += encoded_size
    return entry
51
def rust_slice(s):
    """Format a string-table slice as a Rust ``StringTableSlice`` literal.

    *s* is indexed as ``(byte_start, byte_len)``.  The start offset is split
    into its low eight bits and the remaining high part.  An AssertionError
    is raised when the length or the high part of the offset exceeds 255.
    """
    byte_start, byte_len = s[0], s[1]
    low_byte, high_byte = byte_start & 0xff, byte_start >> 8
    assert byte_len <= 255
    assert high_byte <= 255
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (low_byte, high_byte, byte_len)
60
# Parse the table into (first, last, mapping, unicode_str) tuples, where
# `first`/`last` are the hexadecimal code point strings bounding the range,
# `mapping` is the status name in CamelCase and `unicode_str` is the mapped
# string (None when the row has none).
ranges = []

for line in txt:
    # Everything after '#' is a comment; rows left blank carry no data.
    content = line.partition('#')[0]
    if not content.strip():
        continue
    fields = content.split(';')
    code_points = fields[0].strip()
    if code_points == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.
    first, _, last = code_points.partition('..')
    if not last:
        # A row naming a single code point is a range of length one.
        last = first
    # e.g. "disallowed_STD3_valid" -> "DisallowedStd3Valid"
    mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
    unicode_str = None
    if len(fields) > 2:
        target = fields[2].strip()
        if target:
            unicode_str = u''.join(char(c) for c in target.split(' '))
        elif mapping == "Deviation":
            # A deviation row with an empty third field maps to the empty
            # string rather than to "no string".
            unicode_str = u''
    ranges.append((first, last, mapping, unicode_str))
83
def mergeable_key(r):
    """Return the grouping key for the range tuple *r*.

    Ranges whose mapping has no associated data share a key (the mapping
    name itself), so consecutive ones compare equal.  Ranges whose mapping
    has associated data use the whole tuple as their key, which keeps them
    from being merged with their neighbours.

    Raises AssertionError for an unrecognised mapping name.
    """
    kind = r[2]

    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
        return kind

    # These types have associated data, so we should not merge them.
    return r
92
# Collapse each run of consecutive ranges that share a grouping key (see
# mergeable_key) into one range spanning from the run's first code point to
# its last.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    # The merged range below keeps only the first member's string, so a
    # string on any member would otherwise be silently misapplied.
    for member in group:
        assert member[3] is None
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.  Pairing the list with its own tail visits each
    # adjacent pair and works on both Python 2 and Python 3.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))
125
def is_single_char_range(r):
    """Return True when the range tuple *r* covers exactly one code point.

    *r* must be a 4-tuple ``(first, last, mapping, unicode_str)``; the range
    is a single character when its ``first`` and ``last`` fields are equal.
    """
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    """Yield blocks (lists) of ranges taken from *ranges* in order.

    Consecutive single-character ranges are batched into one block; every
    range spanning more than one character is yielded as a block of its
    own.  Nothing is yielded for empty input, so consumers never receive an
    empty block.
    """
    current = []
    for r in ranges:
        # A block may only grow while both its last member and the incoming
        # range are single characters; otherwise close it and start afresh.
        if current and not (is_single_char_range(current[-1])
                            and is_single_char_range(r)):
            yield current
            current = []
        current.append(r)
    if current:
        yield current
149
# Regroup the ranges into blocks; from here on each element of
# `optimized_ranges` is a list of range tuples.
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


# TABLE holds one Range per block, running from the first code point of the
# block's first range to the last code point of its last range.
print("static TABLE: &'static [Range] = &[")

for ranges in optimized_ranges:
    first, last = ranges[0][0], ranges[-1][1]
    from_escaped = escape_char(char(first))
    to_escaped = escape_char(char(last))
    print("    Range { from: '%s', to: '%s', }," % (from_escaped, to_escaped))

print("];\n")
162
# INDEX_TABLE holds one entry per block: the running count of ranges that
# precede the block, with the top bit set when the block holds exactly one
# range.
print("static INDEX_TABLE: &'static [u16] = &[")

# Flag bit OR-ed into the entry of a one-range block.
SINGLE_MARKER = 1 << 15

offset = 0
for ranges in optimized_ranges:
    # The offset must stay below the flag bit so the two never collide.
    assert offset < SINGLE_MARKER

    block_len = len(ranges)
    if block_len == 1:
        single = SINGLE_MARKER
    else:
        single = 0
    index_entry = offset | single
    print("    %s," % index_entry)
    offset += block_len

print("];\n")
177
# MAPPING_TABLE holds one entry per range, in block order.  A range that
# carries a string gets its string-table slice appended to the mapping name.
print("static MAPPING_TABLE: &'static [Mapping] = &[")

for ranges in optimized_ranges:
    for (first, last, mapping, unicode_str) in ranges:
        if unicode_str is None:
            entry = mapping
        else:
            entry = mapping + rust_slice(strtab_slice(unicode_str))
        print("    %s," % entry)

print("];\n")
187
def escape_str(s):
    """Return a list with the Rust escape of every character in *s*."""
    return [escape_char(c) for c in s]

# Iterating the OrderedDict directly yields its keys in insertion order on
# both Python 2 and Python 3 (`iterkeys` exists only on Python 2).  That is
# the order in which strtab_slice handed out the byte offsets.
escaped_chars = itertools.chain.from_iterable(escape_str(s) for s in strtab)

# Each escaped character goes on its own line, joined by a backslash-newline
# continuation inside the Rust string literal.
print("static STRING_TABLE: &'static str = \"%s\";"
      % '\\\n  '.join(escaped_chars))
193