1# Copyright 2013-2014 The rust-url developers. 2# 3# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 4# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 5# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 6# option. This file may not be copied, modified, or distributed 7# except according to those terms. 8 9# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs 10# You can get the latest idna table from 11# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt 12 13from __future__ import print_function 14import collections 15import itertools 16 17print('''\ 18// Copyright 2013-2014 The rust-url developers. 19// 20// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 21// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 22// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 23// option. This file may not be copied, modified, or distributed 24// except according to those terms. 25 26// Generated by make_idna_table.py 27''') 28 29txt = open("IdnaMappingTable.txt") 30 31def escape_char(c): 32 return "\\u{%x}" % ord(c[0]) 33 34def char(s): 35 return unichr(int(s, 16)) 36 37strtab = collections.OrderedDict() 38strtab_offset = 0 39 40def strtab_slice(s): 41 global strtab, strtab_offset 42 43 if s in strtab: 44 return strtab[s] 45 else: 46 utf8_len = len(s.encode('utf8')) 47 c = (strtab_offset, utf8_len) 48 strtab[s] = c 49 strtab_offset += utf8_len 50 return c 51 52def rust_slice(s): 53 start = s[0] 54 length = s[1] 55 start_lo = start & 0xff 56 start_hi = start >> 8 57 assert length <= 255 58 assert start_hi <= 255 59 return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length) 60 61ranges = [] 62 63for line in txt: 64 # remove comments 65 line, _, _ = line.partition('#') 66 # skip empty lines 67 if len(line.strip()) == 0: 68 continue 69 fields = line.split(';') 70 if fields[0].strip() == 'D800..DFFF': 71 continue # Surrogates don't occur in Rust strings. 72 first, _, last = fields[0].strip().partition('..') 73 if not last: 74 last = first 75 mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '') 76 unicode_str = None 77 if len(fields) > 2: 78 if fields[2].strip(): 79 unicode_str = u''.join(char(c) for c in fields[2].strip().split(' ')) 80 elif mapping == "Deviation": 81 unicode_str = u'' 82 ranges.append((first, last, mapping, unicode_str)) 83 84def mergeable_key(r): 85 mapping = r[2] 86 87 # These types have associated data, so we should not merge them. 88 if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'): 89 return r 90 assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid') 91 return mapping 92 93grouped_ranges = itertools.groupby(ranges, key=mergeable_key) 94 95optimized_ranges = [] 96 97for (k, g) in grouped_ranges: 98 group = list(g) 99 if len(group) == 1: 100 optimized_ranges.append(group[0]) 101 continue 102 # Assert that nothing in the group has an associated unicode string. 103 for g in group: 104 if g[3] is not None and len(g[3]) > 2: 105 assert not g[3][2].strip() 106 # Assert that consecutive members of the group don't leave gaps in 107 # the codepoint space. 108 a, b = itertools.tee(group) 109 next(b, None) 110 for (g1, g2) in itertools.izip(a, b): 111 last_char = int(g1[1], 16) 112 next_char = int(g2[0], 16) 113 if last_char + 1 == next_char: 114 continue 115 # There's a gap where surrogates would appear, but we don't have to 116 # worry about that gap, as surrogates never appear in Rust strings. 117 # Assert we're seeing the surrogate case here. 118 assert last_char == 0xd7ff 119 assert next_char == 0xe000 120 first = group[0][0] 121 last = group[-1][1] 122 mapping = group[0][2] 123 unicode_str = group[0][3] 124 optimized_ranges.append((first, last, mapping, unicode_str)) 125 126def is_single_char_range(r): 127 (first, last, _, _) = r 128 return first == last 129 130# We can reduce the size of the character range table and the index table to about 1/4 131# by merging runs of single character ranges and using character offsets from the start 132# of that range to retrieve the correct `Mapping` value 133def merge_single_char_ranges(ranges): 134 current = [] 135 for r in ranges: 136 if not current or is_single_char_range(current[-1]) and is_single_char_range(r): 137 current.append(r) 138 continue 139 if len(current) != 0: 140 ret = current 141 current = [r] 142 yield ret 143 continue 144 current.append(r) 145 ret = current 146 current = [] 147 yield ret 148 yield current 149 150optimized_ranges = list(merge_single_char_ranges(optimized_ranges)) 151 152 153print("static TABLE: &'static [Range] = &[") 154 155for ranges in optimized_ranges: 156 first = ranges[0][0] 157 last = ranges[-1][1] 158 print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)), 159 escape_char(char(last)))) 160 161print("];\n") 162 163print("static INDEX_TABLE: &'static [u16] = &[") 164 165SINGLE_MARKER = 1 << 15 166 167offset = 0 168for ranges in optimized_ranges: 169 assert offset < SINGLE_MARKER 170 171 block_len = len(ranges) 172 single = SINGLE_MARKER if block_len == 1 else 0 173 print(" %s," % (offset | single)) 174 offset += block_len 175 176print("];\n") 177 178print("static MAPPING_TABLE: &'static [Mapping] = &[") 179 180for ranges in optimized_ranges: 181 for (first, last, mapping, unicode_str) in ranges: 182 if unicode_str is not None: 183 mapping += rust_slice(strtab_slice(unicode_str)) 184 print(" %s," % mapping) 185 186print("];\n") 187 188def escape_str(s): 189 return [escape_char(c) for c in s] 190 191print("static STRING_TABLE: &'static str = \"%s\";" 192 % '\\\n '.join(itertools.chain(*[escape_str(s) for s in strtab.iterkeys()]))) 193