#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we run this script
# out-of-line and check the generated tables.rs and normalization_tests.rs
# files into git.
import collections
import urllib.request

UNICODE_VERSION = "9.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
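
# For example, 'Mn' (Nonspacing_Mark) expands to ['M'], which is how
# _load_unicode_data below recognizes every general category in the Mark
# group when building general_category_mark.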

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size.  For these precomputed tables we note that
        1) the set of characters with a canonical decomposition is a subset
        of the set with a compatibility decomposition, and 2) the two mostly
        agree on that intersection.  Therefore, we don't store entries in the
        compatibility table for characters that decompose the same way under
        canonical decomposition.

            Decomposition table stats:
            Canonical decomp: 2060 chars => 3085 decomposed chars
            Compatible decomp: 3662 chars => 5440 decomposed chars
            Canonical fully decomp: 2060 chars => 3404 decomposed chars
            Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code size cost.
        """
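
        # An example of a character where the two decompositions differ
        # (assuming current UCD data): U+1E9B LATIN SMALL LETTER LONG S WITH
        # DOT ABOVE canonically decomposes to [U+017F, U+0307], and U+017F in
        # turn has a compatibility decomposition to U+0073, so its fully
        # decomposed forms differ: canonical is [017F, 0307] while
        # compatibility is [0073, 0307].  Only characters like this keep
        # their own entry in the compatibility table built below.
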
        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
        S_COUNT = L_COUNT * V_COUNT * T_COUNT
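
        # Precomposed syllables in that block are decomposed arithmetically
        # rather than via the tables built below.  As a reference-only sketch
        # (this generator never calls it; the crate does the equivalent in
        # Rust at run time), the arithmetic from Section 3.12 looks like
        # this, with the standard Jamo base code points written inline:
        def _decompose_hangul_sketch(syllable):
            L_BASE, V_BASE, T_BASE = 0x1100, 0x1161, 0x11A7
            s_index = syllable - S_BASE
            lead = L_BASE + s_index // (V_COUNT * T_COUNT)
            vowel = V_BASE + (s_index % (V_COUNT * T_COUNT)) // T_COUNT
            trail = T_BASE + s_index % T_COUNT
            return [lead, vowel] if trail == T_BASE else [lead, vowel, trail]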

        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree.  When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
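        # A worked example (assuming current UCD data): U+0344 COMBINING
        # GREEK DIALYTIKA TONOS decomposes to [U+0308, U+0301], both of which
        # are non-starters, so it contributes two leading and two trailing
        # non-starters.  Most characters contribute none and never appear in
        # these tables.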
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

def gen_mph_data(name, d, kv_type, kv_callback):
    # Note: writes to the module-level `out` opened in __main__, unlike the
    # other gen_* functions, which take `out` as a parameter.
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    # Composition pairs that mix a BMP and an astral character don't occur in
    # the data, so only all-BMP pairs (packed into a u32 key above) and
    # all-astral pairs (matched in composition_table_astral below) are handled.
    gen_mph_data('composition_table', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_stream_safe(leading, trailing, out):
    # This could be done with a hash table, but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to return a value less than n.
def my_hash(x, salt, n):
    # A multiplicative hash, chosen because multiplication is fast:
    # 2654435769 is the 32-bit golden-ratio constant 0x9E3779B9, and the
    # final fixed-point multiply maps the 32-bit value y into [0, n)
    # without a modulo.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list
# of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = {h: [] for h in range(n)}
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[rehashed] for rehashed in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                exit(1)
    return (salts, keys)
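
# Illustrative only (nothing in this script calls it): a sketch of how the
# emitted SALT/KV tables are meant to be probed at lookup time; the Rust side
# of the crate does the equivalent.  The function name is ours.
def mph_lookup_sketch(x, salts, keys):
    n = len(salts)
    # The zero-salt hash picks a bucket, and that bucket's salt drives a
    # second hash that lands on the key's unique slot.
    index = my_hash(x, salts[my_hash(x, 0, n)], n)
    return index if keys[index] == x else None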

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use quick_check::IsNormalized;\n")
        out.write("use quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u64, u64, u64) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)