#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store the output
# out-of-line and check the generated tables.rs and normalization_tests.rs
# files into git.
import collections
import sys
import urllib.request

UNICODE_VERSION = "13.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)
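
    # Worked example (illustrative, abridged): the UnicodeData.txt row for
    # U+00C0 begins "00C0;LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;...",
    # so the loop above records canon_decomp[0xC0] = [0x41, 0x300]. A
    # decomposition field with a leading tag, e.g. "<fraction> 0031 2044 0034"
    # for U+00BC, goes into compat_decomp instead, with the tag stripped.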

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp
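
    # Worked example (illustrative): canon_decomp[0xC0] == [0x41, 0x300] and
    # U+00C0 is not composition-excluded, so the method above records
    # canon_comp[(0x41, 0x300)] = 0xC0, i.e. "A" followed by COMBINING GRAVE
    # ACCENT recomposes to U+00C0. Singleton and non-starter decompositions
    # are all listed in Full_Composition_Exclusion, which is why the len == 2
    # assert inside the loop holds.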

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size. Then, for these precomputed tables, we note
        that 1) the canonical decomposition table is a subset of the
        compatibility decomposition table and 2) they mostly agree on their
        intersection. Therefore, we don't store entries in the compatibility
        table for characters that decompose the same way under canonical
        decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that decomposition code is very simple and easy to
        inline at a mild code size cost.
        """
        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
        S_COUNT = L_COUNT * V_COUNT * T_COUNT

        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp
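
    # Illustrative sketch (not called by this script): the arithmetic the
    # generated Rust code is expected to use instead of table entries for the
    # Hangul range skipped above, per Unicode Section 3.12. The base constants
    # come from the standard; the helper name is ours.
    @staticmethod
    def _hangul_decompose_example(syllable):
        S_BASE, L_BASE, V_BASE, T_BASE = 0xAC00, 0x1100, 0x1161, 0x11A7
        V_COUNT, T_COUNT = 21, 28
        s_index = syllable - S_BASE
        lead = L_BASE + s_index // (V_COUNT * T_COUNT)
        vowel = V_BASE + (s_index % (V_COUNT * T_COUNT)) // T_COUNT
        trail = T_BASE + s_index % T_COUNT
        # LV syllables have no trailing jamo (trail == T_BASE).
        return [lead, vowel] if trail == T_BASE else [lead, vowel, trail]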

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters
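
# Worked example (illustrative) for the stream-safe tables above: U+0344
# COMBINING GREEK DIALYTIKA TONOS fully decomposes to [U+0308, U+0301], both
# of which have nonzero canonical combining classes, so it is recorded in both
# tables with a leading and a trailing non-starter count of 2.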

hexify = lambda c: '{:04X}'.format(c)

# Note: unlike the other gen_* helpers, gen_mph_data does not take `out` as a
# parameter; it writes to the module-level `out` opened in the __main__ block.
def gen_mph_data(name, d, kv_type, kv_callback):
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# A multiplicative hash, chosen because multiplication is cheap on modern
# hardware. The result is guaranteed to be less than n.
def my_hash(x, salt, n):
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32
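
# Illustrative sketch (not used by this script): how the (salts, keys) pair
# produced by minimal_perfect_hash below is probed at lookup time. The Rust
# crate performs the equivalent two-level lookup on the emitted _SALT/_KV
# arrays; the helper name here is ours.
def mph_lookup_example(x, salts, keys):
    n = len(salts)
    salt = salts[my_hash(x, 0, n)]  # first level selects the bucket's salt
    index = my_hash(x, salt, n)     # second level gives the final slot
    return index if keys[index] == x else None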

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there
                # are a few things that could be done. First, the hash function
                # could be tweaked. Second, the bucket order could be scrambled
                # (especially the singletons). Right now, the buckets are sorted,
                # which has the advantage of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization
                # could be applied (give the direct address for singleton buckets,
                # rather than relying on a rehash). That is definitely the more
                # standard approach in the minimal perfect hashing literature, but
                # in testing the branch was a significant slowdown.
                sys.exit(1)
    return (salts, keys)

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)
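
# Typical regeneration workflow (illustrative; the destination path is an
# assumption, not taken from this file):
#
#     python3 unicode.py
#     mv tables.rs normalization_tests.rs ../src/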