1#!/usr/bin/env python3
2
3import argparse, collections, datetime, os, re, sys, unicodedata
4from urllib.request import urlopen
5from intranges import intranges_from_list
6
# This tool relies on Python 3 semantics (chr() covering all of Unicode,
# str/bytes separation); bail out early on anything older.
if sys.version_info < (3,):
    print("Only Python 3 supported.")
    sys.exit(2)
10
# Unicode version used when --version is left at "preferred".
PREFERRED_VERSION = '12.1.0'

# Download locations for the data files. HTTPS is used so that the files the
# generated IDNA tables are derived from cannot be altered in transit.
UCD_URL = 'https://www.unicode.org/Public/{version}/ucd/{filename}'
UTS46_URL = 'https://www.unicode.org/Public/idna/{version}/{filename}'

# Cache location used when neither --cache nor --no-cache is given.
DEFAULT_CACHE_DIR = '~/.cache/unidata'
16
# Scripts whose membership is consulted by the IDNA contextual rules; only
# these scripts are written to the generated library data.
SCRIPT_WHITELIST = sorted(('Greek', 'Han', 'Hebrew', 'Hiragana', 'Katakana'))

# The UTS#46 table is emitted in chunks of this many rows (one function per
# chunk) for Jython compatibility.
UTS46_SEGMENT_SIZE = 100

# IdnaMappingTable.txt status name -> (one-character code written to the
# generated table, whether the row carries a mapping string).
UTS46_STATUSES = dict(
    valid=("V", False),
    ignored=("I", False),
    mapped=("M", True),
    deviation=("D", True),
    disallowed=("X", False),
    disallowed_STD3_valid=("3", False),
    disallowed_STD3_mapped=("3", True),
)
32
# Exceptions are manually assigned in Section 2.6 of RFC 5892.
# They are listed here grouped by the status they are forced to.
_EXCEPTION_GROUPS = (
    ('PVALID', (
        0x00DF,  # LATIN SMALL LETTER SHARP S
        0x03C2,  # GREEK SMALL LETTER FINAL SIGMA
        0x06FD,  # ARABIC SIGN SINDHI AMPERSAND
        0x06FE,  # ARABIC SIGN SINDHI POSTPOSITION MEN
        0x0F0B,  # TIBETAN MARK INTERSYLLABIC TSHEG
        0x3007,  # IDEOGRAPHIC NUMBER ZERO
    )),
    ('CONTEXTO', (
        0x00B7,  # MIDDLE DOT
        0x0375,  # GREEK LOWER NUMERAL SIGN (KERAIA)
        0x05F3,  # HEBREW PUNCTUATION GERESH
        0x05F4,  # HEBREW PUNCTUATION GERSHAYIM
        0x30FB,  # KATAKANA MIDDLE DOT
        0x0660,  # ARABIC-INDIC DIGIT ZERO
        0x0661,  # ARABIC-INDIC DIGIT ONE
        0x0662,  # ARABIC-INDIC DIGIT TWO
        0x0663,  # ARABIC-INDIC DIGIT THREE
        0x0664,  # ARABIC-INDIC DIGIT FOUR
        0x0665,  # ARABIC-INDIC DIGIT FIVE
        0x0666,  # ARABIC-INDIC DIGIT SIX
        0x0667,  # ARABIC-INDIC DIGIT SEVEN
        0x0668,  # ARABIC-INDIC DIGIT EIGHT
        0x0669,  # ARABIC-INDIC DIGIT NINE
        0x06F0,  # EXTENDED ARABIC-INDIC DIGIT ZERO
        0x06F1,  # EXTENDED ARABIC-INDIC DIGIT ONE
        0x06F2,  # EXTENDED ARABIC-INDIC DIGIT TWO
        0x06F3,  # EXTENDED ARABIC-INDIC DIGIT THREE
        0x06F4,  # EXTENDED ARABIC-INDIC DIGIT FOUR
        0x06F5,  # EXTENDED ARABIC-INDIC DIGIT FIVE
        0x06F6,  # EXTENDED ARABIC-INDIC DIGIT SIX
        0x06F7,  # EXTENDED ARABIC-INDIC DIGIT SEVEN
        0x06F8,  # EXTENDED ARABIC-INDIC DIGIT EIGHT
        0x06F9,  # EXTENDED ARABIC-INDIC DIGIT NINE
    )),
    ('DISALLOWED', (
        0x0640,  # ARABIC TATWEEL
        0x07FA,  # NKO LAJANYALAN
        0x302E,  # HANGUL SINGLE DOT TONE MARK
        0x302F,  # HANGUL DOUBLE DOT TONE MARK
        0x3031,  # VERTICAL KANA REPEAT MARK
        0x3032,  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
        0x3033,  # VERTICAL KANA REPEAT MARK UPPER HALF
        0x3034,  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
        0x3035,  # VERTICAL KANA REPEAT MARK LOWER HALF
        0x303B,  # VERTICAL IDEOGRAPHIC ITERATION MARK
    )),
)

# Code point -> status that overrides the derived IDNA 2008 classification.
exceptions = {
    codepoint: status
    for status, codepoints in _EXCEPTION_GROUPS
    for codepoint in codepoints
}

# Code point -> status overrides kept for backwards compatibility
# (currently none).
backwardscompatible = {}
78
79
def hexrange(start, end):
    """Return the inclusive integer range spanned by two hexadecimal strings."""
    first = int(start, 16)
    last = int(end, 16)
    # range() excludes its stop value, so step one past the last code point.
    return range(first, last + 1)
82
def hexvalue(value):
    """Parse a hexadecimal string (e.g. ``'00DF'``) into an integer."""
    number = int(value, 16)
    return number
85
86
class UnicodeVersion(object):
    """A Unicode version: either a concrete ``major.minor.patch`` or ``latest``.

    Concrete versions expose ``major``, ``minor``, ``patch`` and ``numerical``;
    the ``latest`` pseudo-version has none of them and only sets ``latest``.

    Ordering rules:
      * concrete versions compare component-wise;
      * ``latest`` is greater than anything it is compared with;
      * ``latest`` is never equal to anything (its content is not known).

    Raises:
        ValueError: if ``version`` is neither ``x.y.z`` nor ``'latest'``.
    """

    # __eq__ is customised and instances are not meant to be dict keys.
    __hash__ = None

    def __init__(self, version):
        result = re.match(r'^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$', version)
        if result:
            self.major = int(result.group('major'))
            self.minor = int(result.group('minor'))
            self.patch = int(result.group('patch'))
            # Kept as a convenience integer. Comparisons use the component
            # tuple instead, because any fixed-width packing collides once a
            # component outgrows its field (with the former 8/4-bit packing
            # 5.16.0 compared equal to 6.0.0).
            self.numerical = (self.major << 16) + (self.minor << 8) + self.patch
            self.latest = False
        elif version == 'latest':
            self.latest = True
        else:
            raise ValueError('Unrecognized Unicode version')

    @property
    def _key(self):
        """Component tuple used for ordering; only valid for concrete versions."""
        return (self.major, self.minor, self.patch)

    def __repr__(self, with_date=True):
        """Return ``x.y.z``, or ``latest`` optionally suffixed with today's date."""
        if self.latest:
            if with_date:
                return 'latest@{}'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
            else:
                return 'latest'
        else:
            return "{}.{}.{}".format(self.major, self.minor, self.patch)

    @property
    def tag(self):
        """Stable label for this version (no date suffix), usable in paths and URLs."""
        return self.__repr__(with_date=False)

    def __gt__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest:
            return True
        if other.latest:
            # A concrete version is never newer than "latest".
            return False
        return self._key > other._key

    def __lt__(self, other):
        # Mirror of __gt__, spelled out so that ``a < b`` does not depend on
        # Python's reflected-operator fallback.
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        return other > self

    def __eq__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest or other.latest:
            return False
        return self._key == other._key
124
125
class UnicodeData(object):
    """Loads the Unicode data files this tool needs and indexes them by code point.

    Each file is read from ``args.source`` when that is set; otherwise it is
    taken from the cache directory or, failing that, downloaded (and then
    written to the cache when caching is enabled).

    Attributes filled in by the loaders:
        ucd_data:   code point -> UnicodeData.txt fields following the code point
        ucd_props:  code point -> list of property names (PropList.txt and
                    DerivedCoreProperties.txt); a ``defaultdict(list)``
        ucd_block:  code point -> block name
        ucd_cf:     code point -> folded string (C and F foldings only)
        ucd_hst:    code point -> Hangul syllable type
        ucd_as:     code point -> joining type field of ArabicShaping.txt
        ucd_s:      script name -> set of code points
        ucd_idnamt: code point -> list of IdnaMappingTable.txt fields
        max:        highest code point covered by Blocks.txt

    Args:
        version: Unicode version string accepted by ``UnicodeVersion``.
        cache: cache directory, or a false value to disable caching.
        args: parsed command-line arguments; only ``args.source`` is read.
    """

    def __init__(self, version, cache, args):
        self.version = UnicodeVersion(version)
        self.system_version = UnicodeVersion(unicodedata.unidata_version)
        self.source = args.source
        self.cache = cache
        self.max = 0

        # Normalisation is done by the interpreter's own unicodedata module,
        # so an interpreter older than the requested data may misjudge
        # stability of newer characters.
        if self.system_version < self.version:
            print("Warning: Character stability not guaranteed as Python Unicode data {}"
                   " older than requested {}".format(self.system_version, self.version))

        self._load_unicodedata()
        self._load_proplist()
        self._load_derivedcoreprops()
        self._load_blocks()
        self._load_casefolding()
        self._load_hangulst()
        self._load_arabicshaping()
        self._load_scripts()
        self._load_uts46mapping()

    @staticmethod
    def _span(result):
        """Return the code points of a matched ``start[..end]`` entry as a range."""
        start = result.group('start')
        # A missing "..end" part means a single code point.
        return hexrange(start, result.group('end') or start)

    def _load_unicodedata(self):
        """Index UnicodeData.txt, expanding ``<X, First>``/``<X, Last>`` pairs."""
        self.ucd_data = {}
        range_marker = re.compile(r'^<(?P<name>.*?), (?P<edge>First|Last)>$')
        range_begin = None
        for line in self._ucdfile('UnicodeData.txt').splitlines():
            if not line.strip():
                continue
            fields = line.split(';')
            value = int(fields[0], 16)
            marker = range_marker.match(fields[1])
            if marker is None:
                self.ucd_data[value] = fields[1:]
            elif marker.group('edge') == 'First':
                range_begin = value
            else:
                if range_begin is None:
                    raise ValueError(
                        'UnicodeData.txt: range end {:04X} without a range start'.format(value))
                # Every code point in the range gets the bare "<X>" name.
                fields[1] = '<{}>'.format(marker.group('name'))
                for i in range(range_begin, value + 1):
                    self.ucd_data[i] = fields[1:]
                range_begin = None

    def _merge_props(self, filename):
        """Append the properties listed in ``filename`` to ``self.ucd_props``."""
        pattern = re.compile(
            r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$')
        for line in self._ucdfile(filename).splitlines():
            result = pattern.match(line)
            if result:
                prop = result.group('prop')
                for i in self._span(result):
                    self.ucd_props[i].append(prop)

    def _load_proplist(self):
        """Start a fresh property index and fill it from PropList.txt."""
        self.ucd_props = collections.defaultdict(list)
        self._merge_props('PropList.txt')

    def _load_derivedcoreprops(self):
        """Add DerivedCoreProperties.txt to the index created by _load_proplist."""
        self._merge_props('DerivedCoreProperties.txt')

    def _load_blocks(self):
        """Index Blocks.txt and record the highest code point any block covers."""
        self.ucd_block = {}
        # The lazy ".*?" leaves trailing whitespace to the final "\s*" so it
        # never ends up in the block name.
        pattern = re.compile(
            r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*?)\s*$')
        for line in self._ucdfile('Blocks.txt').splitlines():
            result = pattern.match(line)
            if result:
                block = result.group('block')
                for i in self._span(result):
                    self.ucd_block[i] = block
                    self.max = max(self.max, i)

    def _load_casefolding(self):
        """Index the common (C) and full (F) foldings of CaseFolding.txt."""
        self.ucd_cf = {}
        pattern = re.compile(
            r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*')
        for line in self._ucdfile('CaseFolding.txt').splitlines():
            result = pattern.match(line)
            if result and result.group('type') in ('C', 'F'):
                # split() without an argument tolerates runs of whitespace.
                self.ucd_cf[int(result.group('cp'), 16)] = \
                    ''.join(chr(int(x, 16)) for x in result.group('subst').split())

    def _load_hangulst(self):
        """Index HangulSyllableType.txt (both ranges and single code points)."""
        self.ucd_hst = {}
        pattern = re.compile(
            r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<type>\S+)\s*(|\#.*)$')
        for line in self._ucdfile('HangulSyllableType.txt').splitlines():
            result = pattern.match(line)
            if result:
                syllable_type = result.group('type')
                for i in self._span(result):
                    self.ucd_hst[i] = syllable_type

    def _load_arabicshaping(self):
        """Index the third field (joining type) of ArabicShaping.txt."""
        self.ucd_as = {}
        pattern = re.compile(r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*.*?\s*;\s*(?P<jt>\S+)\s*;')
        for line in self._ucdfile('ArabicShaping.txt').splitlines():
            result = pattern.match(line)
            if result:
                self.ucd_as[int(result.group('cp'), 16)] = result.group('jt')

    def _load_scripts(self):
        """Build the script name -> set of code points index from Scripts.txt."""
        self.ucd_s = {}
        pattern = re.compile(
            r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$')
        for line in self._ucdfile('Scripts.txt').splitlines():
            result = pattern.match(line)
            if result:
                members = self.ucd_s.setdefault(result.group('script'), set())
                members.update(self._span(result))

    def _load_uts46mapping(self):
        """Index IdnaMappingTable.txt as code point -> list of stripped fields."""
        self.ucd_idnamt = {}
        pattern = re.compile(
            r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)')
        f_idnamt = self._ucdfile('IdnaMappingTable.txt', urlbase=UTS46_URL)
        for line in f_idnamt.splitlines():
            result = pattern.match(line)
            if result:
                fields = [x.strip() for x in result.group('fields').split(';')]
                # All code points of one range share the same fields list.
                for i in self._span(result):
                    self.ucd_idnamt[i] = fields

    def _ucdfile(self, filename, urlbase=UCD_URL):
        """Return the text of a Unicode data file.

        Lookup order: ``self.source`` directory (exclusively, when set), then
        the cache, then a download from ``urlbase``. Files are always decoded
        as UTF-8, matching how the cache is written.
        """
        if self.source:
            with open(os.path.join(self.source, filename), encoding='utf-8') as f:
                return f.read()

        cache_file = None
        if self.cache:
            cache_file = os.path.expanduser(
                os.path.join(self.cache, self.version.tag, filename))
            if os.path.isfile(cache_file):
                with open(cache_file, encoding='utf-8') as f:
                    return f.read()

        version_path = self.version.tag
        if version_path == 'latest':
            # NOTE(review): this substitution is also applied to UTS46_URL,
            # giving ".../idna/UCD/latest/..." - confirm that path exists.
            version_path = 'UCD/latest'
        url = urlbase.format(
            version=version_path,
            filename=filename,
        )
        with urlopen(url) as response:
            content = response.read().decode('utf-8')

        if cache_file:
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)
            with open(cache_file, 'wb') as f:
                f.write(content.encode('utf-8'))

        return content

    def codepoints(self):
        """Yield a CodePoint for every value from 0 up to ``self.max`` inclusive."""
        for i in range(0, self.max + 1):
            yield CodePoint(i, ucdata=self)
316
317
class CodePoint:
    """One code point plus the loaded Unicode data needed to classify it.

    Args:
        value: the code point as an integer.
        ucdata: object providing the ``ucd_*`` lookup tables (a UnicodeData).

    The properties below implement the individual tests of the IDNA 2008
    derivation; ``idna2008_status`` combines them in order.
    """

    def __init__(self, value=None, ucdata=None):
        self.value = value
        self.ucdata = ucdata

    def _casefold(self, s):
        """Replace each character of ``s`` by its loaded case folding, if any."""
        foldings = self.ucdata.ucd_cf
        return ''.join(foldings.get(ord(c), c) for c in s)

    @property
    def _props(self):
        """Property names recorded for this code point (empty if none).

        Uses ``.get()`` on purpose: ``ucd_props`` is a ``defaultdict``, so
        indexing it would insert an empty list for every code point that is
        merely looked up.
        """
        return self.ucdata.ucd_props.get(self.value, ())

    @property
    def exception_value(self):
        """Status forced by the exceptions table, or False."""
        return exceptions.get(self.value, False)

    @property
    def compat_value(self):
        """Status forced by the backwards-compatibility table, or False."""
        return backwardscompatible.get(self.value, False)

    @property
    def name(self):
        """Character name, or ``<noncharacter>`` / ``<reserved>`` placeholders."""
        if self.value in self.ucdata.ucd_data:
            return self.ucdata.ucd_data[self.value][0]
        elif 'Noncharacter_Code_Point' in self._props:
            return '<noncharacter>'
        else:
            return '<reserved>'

    @property
    def general_category(self):
        """General category from UnicodeData.txt, or None when not listed."""
        return self.ucdata.ucd_data.get(self.value, [None, None])[1]

    @property
    def unassigned(self):
        """True when the code point is neither listed nor a noncharacter."""
        return not ('Noncharacter_Code_Point' in self._props or
                    self.value in self.ucdata.ucd_data)

    @property
    def ldh(self):
        """True for hyphen-minus, ASCII digits and lowercase ASCII letters."""
        return (self.value == 0x002d or
                self.value in range(0x0030, 0x0039 + 1) or
                self.value in range(0x0061, 0x007a + 1))

    @property
    def join_control(self):
        return 'Join_Control' in self._props

    @property
    def joining_type(self):
        """Joining type from ArabicShaping.txt, or None."""
        return self.ucdata.ucd_as.get(self.value, None)

    @property
    def char(self):
        return chr(self.value)

    @property
    def nfkc_cf(self):
        """NFKC(casefold(NFKC(char))), using the interpreter's normaliser."""
        return unicodedata.normalize('NFKC',
                                     self._casefold(unicodedata.normalize('NFKC', self.char)))

    @property
    def unstable(self):
        """True when normalisation plus case folding changes the character."""
        return self.char != self.nfkc_cf

    @property
    def in_ignorableproperties(self):
        props = self._props
        return any(prop in props for prop in
                   ('Default_Ignorable_Code_Point', 'White_Space', 'Noncharacter_Code_Point'))

    @property
    def in_ignorableblocks(self):
        return self.ucdata.ucd_block.get(self.value) in (
            'Combining Diacritical Marks for Symbols', 'Musical Symbols',
            'Ancient Greek Musical Notation'
        )

    @property
    def oldhanguljamo(self):
        return self.ucdata.ucd_hst.get(self.value) in ('L', 'V', 'T')

    @property
    def in_lettersdigits(self):
        return self.general_category in ('Ll', 'Lu', 'Lo', 'Nd', 'Lm', 'Mn', 'Mc')

    @property
    def idna2008_status(self):
        """Derived IDNA 2008 status; the first matching rule below wins."""
        if self.exception_value:
            return self.exception_value
        elif self.compat_value:
            return self.compat_value
        elif self.unassigned:
            return 'UNASSIGNED'
        elif self.ldh:
            return 'PVALID'
        elif self.join_control:
            return 'CONTEXTJ'
        elif self.unstable:
            return 'DISALLOWED'
        elif self.in_ignorableproperties:
            return 'DISALLOWED'
        elif self.in_ignorableblocks:
            return 'DISALLOWED'
        elif self.oldhanguljamo:
            return 'DISALLOWED'
        elif self.in_lettersdigits:
            return 'PVALID'
        else:
            return 'DISALLOWED'

    @property
    def uts46_data(self):
        """Fields of the IdnaMappingTable.txt row for this code point, or None."""
        return self.ucdata.ucd_idnamt.get(self.value, None)

    @property
    def uts46_status(self):
        """The UTS #46 fields joined by spaces, or None when there is no row."""
        data = self.uts46_data
        if data is None:
            return None
        return ' '.join(data)
439
440
def diagnose_codepoint(codepoint, args, ucdata):
    """Print every intermediate test result used to classify one code point.

    Numbered lines correspond to the ordered rules of the IDNA 2008
    derivation; unnumbered lines show the raw data a rule is based on.

    Args:
        codepoint: the code point as an integer.
        args: parsed command-line arguments (not used here).
        ucdata: the loaded UnicodeData instance.
    """

    cp = CodePoint(codepoint, ucdata=ucdata)

    # "or" rather than a .get() default: ucd_props is a defaultdict and an
    # earlier property access may already have stored an empty list for this
    # code point, in which case a .get() default would never be used.
    properties = ucdata.ucd_props.get(codepoint) or ['None']

    # A code point absent from the mapping table has no fields to join.
    uts46_fields = cp.uts46_data
    uts46_status = ' '.join(uts46_fields) if uts46_fields is not None else None

    print("U+{:04X}:".format(codepoint))
    print("   Name:             {}".format(cp.name))
    print("1  Exceptions:       {}".format(exceptions.get(codepoint, False)))
    print("2  Backwards Compat: {}".format(backwardscompatible.get(codepoint, False)))
    print("3  Unassigned:       {}".format(cp.unassigned))
    print("4  LDH:              {}".format(cp.ldh))
    print("   Properties:       {}".format(" ".join(sorted(properties))))
    print("5  .Join Control:    {}".format(cp.join_control))
    print("   NFKC CF:          {}".format(" ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])))
    print("6  .Unstable:        {}".format(cp.unstable))
    print("7  .Ignorable Prop:  {}".format(cp.in_ignorableproperties))
    print("   Block:            {}".format(ucdata.ucd_block.get(codepoint, None)))
    print("8  .Ignorable Block: {}".format(cp.in_ignorableblocks))
    print("   Hangul Syll Type: {}".format(ucdata.ucd_hst.get(codepoint, None)))
    print("9  .Old Hangul Jamo: {}".format(cp.oldhanguljamo))
    print("   General Category: {}".format(cp.general_category))
    print("10 .Letters Digits:  {}".format(cp.in_lettersdigits))
    print("== IDNA 2008:        {}".format(cp.idna2008_status))
    print("== UTS 46:           {}".format(uts46_status))
    print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))
465
def ucdrange(start, end):
    """Format a run of code points as a ``(values, description)`` pair.

    A run whose two ends compare equal is shown as one code point; otherwise
    both the hex values and the names are joined with ``..``.
    """
    if not start == end:
        values = "{:04X}..{:04X}".format(start.value, end.value)
        description = "{}..{}".format(start.name, end.name)
        return (values, description)
    return ("{:04X}".format(start.value), start.name)
472
def optimised_list(d):
    """Yield the source lines of a tuple literal holding ``d`` as packed ranges.

    The values come from ``intranges_from_list`` and are written one per
    line in hexadecimal, between an opening ``(`` and a closing ``),``.
    """
    yield '('
    yield from ('        {},'.format(hex(packed)) for packed in intranges_from_list(d))
    yield '    ),'
478
def make_table(args, ucdata):
    """Write an IANA-style table of IDNA 2008 statuses, one row per run.

    Consecutive code points with the same status are merged into one row.
    The table goes to ``<args.dir>/idna-table-<version>.txt`` when
    ``args.dir`` is set, otherwise to standard output.
    """

    table_data = []

    # Only the two ends of the current run are needed to describe it.
    first = last = None
    run_status = None

    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if first is not None and status != run_status:
            (values, description) = ucdrange(first, last)
            table_data.append([values, run_status, description])
            first = None
        if first is None:
            first = cp
        last = cp
        run_status = status

    # Flush the final run (there is none if no code points were produced).
    if first is not None:
        (values, description) = ucdrange(first, last)
        table_data.append([values, run_status, description])

    if args.dir:

        # Written as bytes so line endings are "\n" on every platform.
        with open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), 'wb') as f:
            for row in table_data:
                f.write("{:12}; {:12}# {:.44}\n".format(*row).encode('ascii'))

    else:

        for row in table_data:
            print("{:12}; {:12}# {:.44}".format(*row))
507
def idna_libdata(ucdata):
    """Yield the lines of the generated ``idnadata`` module.

    The module holds ``__version__`` plus three dictionaries: ``scripts``,
    ``joining_types`` and ``codepoint_classes``.
    """

    def entry(key, members):
        # One dictionary item: the quoted key goes in front of the first
        # line of the tuple literal, the remaining lines are passed through.
        label = "    '{0}': ".format(key)
        for position, text in enumerate(optimised_list(members)):
            yield label + text if position == 0 else text

    yield "# This file is automatically generated by tools/idna-data\n"
    yield "__version__ = \"{}\"".format(ucdata.version)

    # Script membership is needed by some CONTEXTO rules
    # (contextual rules registry: RFC 5892, Appendix A).
    yield "scripts = {"
    for script in SCRIPT_WHITELIST:
        yield from entry(script, ucdata.ucd_s[script])
    yield "}"

    # Joining types are needed by CONTEXTJ rule A.1.
    yield "joining_types = {"
    for cp in ucdata.codepoints():
        joining_type = cp.joining_type
        if joining_type:
            yield "    0x{0:x}: {1},".format(cp.value, ord(joining_type))
    yield "}"

    # Code points grouped by status; UNASSIGNED and DISALLOWED are left out
    # of the generated data.
    yield "codepoint_classes = {"
    classes = {}
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if status not in ('UNASSIGNED', 'DISALLOWED'):
            classes.setdefault(status, set()).add(cp.value)
    for status in ['PVALID', 'CONTEXTJ', 'CONTEXTO']:
        yield from entry(status, classes[status])
    yield "}"
551
def uts46_ranges(ucdata):
    """Yield UTS #46 table rows as Python tuple-literal source strings.

    Each row marks the start of a run: above code point 255 a row is only
    produced when the (status, mapping) pair differs from the previous row,
    while code points 0..255 always get a row of their own.
    """

    previous = (None, None)
    for cp in ucdata.codepoints():
        fields = cp.uts46_data
        if not fields:
            continue

        status, has_mapping = UTS46_STATUSES[fields[0]]
        mapping = None
        if has_mapping:
            # Second field: space-separated hex code points of the mapping.
            chars = [chr(int(codepoint, 16)) for codepoint in fields[1].split()]
            # Escape for embedding in a single-quoted string literal.
            mapping = "".join(chars).replace("\\", "\\\\").replace("'", "\\'")

        current = (status, mapping)
        if cp.value > 255 and current == previous:
            continue
        previous = current

        if mapping is None:
            yield "(0x{0:X}, '{1}')".format(cp.value, status)
        else:
            yield "(0x{0:X}, '{1}', u'{2}')".format(cp.value, status, mapping)
573
def uts46_libdata(ucdata):
    """Yield the lines of the generated ``uts46data`` module.

    Rows are spread over ``_seg_N()`` functions of UTS46_SEGMENT_SIZE rows
    each, and ``uts46data`` is the tuple made by concatenating all segments.
    """

    yield "# This file is automatically generated by tools/idna-data"
    yield "# vim: set fileencoding=utf-8 :\n"
    yield '"""IDNA Mapping Table from UTS46."""\n\n'

    yield "__version__ = \"{}\"".format(ucdata.version)

    # Stays at -1 when there are no rows at all.
    idx = -1
    for idx, row in enumerate(uts46_ranges(ucdata)):
        segment, offset = divmod(idx, UTS46_SEGMENT_SIZE)
        if offset == 0:
            # Close the previous segment before opening the next one.
            if idx != 0:
                yield "    ]\n"
            yield "def _seg_{0}():\n    return [".format(segment)
        yield "    {0},".format(row)
    yield "    ]\n"

    last_segment = idx // UTS46_SEGMENT_SIZE
    yield "uts46data = tuple("
    yield "    _seg_0()"
    for i in range(1, last_segment + 1):
        yield "    + _seg_{0}()".format(i)
    yield ")"
597
def make_libdata(args, ucdata):
    """Write ``idnadata.py`` and ``uts46data.py`` into ``args.dir`` (default ``.``).

    Both files are written as UTF-8 with one generated line per output line.
    """

    dest_dir = args.dir or '.'

    outputs = (
        ('idnadata.py', idna_libdata),
        ('uts46data.py', uts46_libdata),
    )
    for filename, generate in outputs:
        target_filename = os.path.join(dest_dir, filename)
        with open(target_filename, 'wb') as target:
            for line in generate(ucdata):
                target.write((line + "\n").encode('utf-8'))
611
def arg_error(message, parser):
    """Print the parser's usage line and an error message, then exit with status 2."""

    parser.print_usage()
    program = sys.argv[0]
    print('{}: error: {}'.format(program, message))
    raise SystemExit(2)
617
def main():
    """Command-line entry point.

    Parses the arguments, validates the requested action, loads the Unicode
    data and then builds the library data, builds the reference table, or
    prints the diagnosis of a single code point.
    """

    parser = argparse.ArgumentParser(description='Determine IDNA code-point validity data')
    parser.add_argument('action', type=str,
                        help='Task to perform (make-libdata, make-table, <codepoint>)')

    parser.add_argument('--version', type=str, default='preferred',
                        help='Unicode version to use (preferred, latest, <x.y.z>)')
    parser.add_argument('--source', type=str, default=None,
                        help='Where to fetch Unicode data (file path)')
    parser.add_argument('--dir', type=str, default=None, help='Where to export the output')
    parser.add_argument('--cache', type=str, default=None, help='Where to cache Unicode data')
    parser.add_argument('--no-cache', action='store_true', help='Don\'t cache Unicode data')

    # These groups hold no options; they only add descriptive sections to --help.
    parser.add_argument_group('make-libdata', 'Make module data for Python IDNA library')
    parser.add_argument_group('make-table', 'Make IANA-style reference table')
    parser.add_argument_group('codepoint',
                              'Display related data for given codepoint (e.g. U+0061)')

    args = parser.parse_args()

    if args.version == 'preferred':
        target_version = PREFERRED_VERSION
    else:
        target_version = args.version

    if args.cache and args.no_cache:
        arg_error('I can\'t both --cache and --no-cache', parser)
    cache = args.cache or DEFAULT_CACHE_DIR
    if args.no_cache:
        cache = None

    # Validate the action before loading (and possibly downloading) all the
    # Unicode data, so that a typo fails immediately.
    codepoint = None
    if args.action not in ('make-table', 'make-libdata'):
        # The flag is passed as an argument: an inline "(?i)" anywhere but
        # the very start of the pattern is rejected by newer Python versions.
        result = re.match(r'^(U\+|)(?P<cp>[0-9A-F]{4,6})$', args.action, re.IGNORECASE)
        if result:
            codepoint = int(result.group('cp'), 16)
        # Six hex digits can exceed the Unicode range, which chr() rejects.
        if codepoint is None or codepoint > 0x10FFFF:
            arg_error('Don\'t recognize action or codepoint value', parser)

    ucdata = UnicodeData(target_version, cache, args)

    if args.action == 'make-table':
        make_table(args, ucdata)
    elif args.action == 'make-libdata':
        make_libdata(args, ucdata)
    else:
        diagnose_codepoint(codepoint, args, ucdata)
        sys.exit(0)


if __name__ == '__main__':
    main()
668
669
670
671