#!/usr/bin/env python3
"""Determine IDNA code-point validity data from the Unicode Character Database.

Downloads (and optionally caches) UCD and UTS#46 data files, classifies every
code point per RFC 5892, and can emit an IANA-style reference table, Python
module data for the idna library, or a diagnostic report for one code point.
"""

import argparse
import collections
import datetime
import os
import re
import sys
import unicodedata
from urllib.request import urlopen

from intranges import intranges_from_list

if sys.version_info[0] < 3:
    print("Only Python 3 supported.")
    sys.exit(2)

PREFERRED_VERSION = '12.1.0'

# Templates filled in by UnicodeData._ucdfile() with version= and filename=.
UCD_URL = 'http://www.unicode.org/Public/{version}/ucd/{filename}'
UTS46_URL = 'http://www.unicode.org/Public/idna/{version}/{filename}'

DEFAULT_CACHE_DIR = '~/.cache/unidata'

# Scripts affected by IDNA contextual rules
SCRIPT_WHITELIST = sorted(['Greek', 'Han', 'Hebrew', 'Hiragana', 'Katakana'])

# Used to piece apart UTS#46 data for Jython compatibility
UTS46_SEGMENT_SIZE = 100

# UTS#46 status name -> (single-letter code, whether a mapping field follows).
UTS46_STATUSES = {
    "valid": ("V", False),
    "ignored": ("I", False),
    "mapped": ("M", True),
    "deviation": ("D", True),
    "disallowed": ("X", False),
    "disallowed_STD3_valid": ("3", False),
    "disallowed_STD3_mapped": ("3", True)
}

# Exceptions are manually assigned in Section 2.6 of RFC 5892.
exceptions = {
    0x00DF: 'PVALID',      # LATIN SMALL LETTER SHARP S
    0x03C2: 'PVALID',      # GREEK SMALL LETTER FINAL SIGMA
    0x06FD: 'PVALID',      # ARABIC SIGN SINDHI AMPERSAND
    0x06FE: 'PVALID',      # ARABIC SIGN SINDHI POSTPOSITION MEN
    0x0F0B: 'PVALID',      # TIBETAN MARK INTERSYLLABIC TSHEG
    0x3007: 'PVALID',      # IDEOGRAPHIC NUMBER ZERO
    0x00B7: 'CONTEXTO',    # MIDDLE DOT
    0x0375: 'CONTEXTO',    # GREEK LOWER NUMERAL SIGN (KERAIA)
    0x05F3: 'CONTEXTO',    # HEBREW PUNCTUATION GERESH
    0x05F4: 'CONTEXTO',    # HEBREW PUNCTUATION GERSHAYIM
    0x30FB: 'CONTEXTO',    # KATAKANA MIDDLE DOT
    0x0660: 'CONTEXTO',    # ARABIC-INDIC DIGIT ZERO
    0x0661: 'CONTEXTO',    # ARABIC-INDIC DIGIT ONE
    0x0662: 'CONTEXTO',    # ARABIC-INDIC DIGIT TWO
    0x0663: 'CONTEXTO',    # ARABIC-INDIC DIGIT THREE
    0x0664: 'CONTEXTO',    # ARABIC-INDIC DIGIT FOUR
    0x0665: 'CONTEXTO',    # ARABIC-INDIC DIGIT FIVE
    0x0666: 'CONTEXTO',    # ARABIC-INDIC DIGIT SIX
    0x0667: 'CONTEXTO',    # ARABIC-INDIC DIGIT SEVEN
    0x0668: 'CONTEXTO',    # ARABIC-INDIC DIGIT EIGHT
    0x0669: 'CONTEXTO',    # ARABIC-INDIC DIGIT NINE
    0x06F0: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT ZERO
    0x06F1: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT ONE
    0x06F2: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT TWO
    0x06F3: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT THREE
    0x06F4: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT FOUR
    0x06F5: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT FIVE
    0x06F6: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT SIX
    0x06F7: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT SEVEN
    0x06F8: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT EIGHT
    0x06F9: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT NINE
    0x0640: 'DISALLOWED',  # ARABIC TATWEEL
    0x07FA: 'DISALLOWED',  # NKO LAJANYALAN
    0x302E: 'DISALLOWED',  # HANGUL SINGLE DOT TONE MARK
    0x302F: 'DISALLOWED',  # HANGUL DOUBLE DOT TONE MARK
    0x3031: 'DISALLOWED',  # VERTICAL KANA REPEAT MARK
    0x3032: 'DISALLOWED',  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
    0x3033: 'DISALLOWED',  # VERTICAL KANA REPEAT MARK UPPER HALF
    0x3034: 'DISALLOWED',  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
    0x3035: 'DISALLOWED',  # VERTICAL KANA REPEAT MARK LOWER HALF
    0x303B: 'DISALLOWED',  # VERTICAL IDEOGRAPHIC ITERATION MARK
}

# Section 2.7 of RFC 5892: no backward-compatibility overrides defined yet.
backwardscompatible = {}


def hexrange(start, end):
    """Return an inclusive range over the hex-string bounds *start*..*end*."""
    return range(int(start, 16), int(end, 16) + 1)


def hexvalue(value):
    """Parse a hexadecimal string into an integer code point."""
    return int(value, 16)


class UnicodeVersion(object):
    """A Unicode version, either an explicit 'x.y.z' triplet or 'latest'."""

    def __init__(self, version):
        # NOTE: raw string — '\d' in a non-raw pattern is an invalid escape.
        result = re.match(r'^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$', version)
        if result:
            self.major = int(result.group('major'))
            self.minor = int(result.group('minor'))
            self.patch = int(result.group('patch'))
            # Packed form used only for ordering comparisons below.
            self.numerical = (self.major << 8) + (self.minor << 4) + self.patch
            self.latest = False
        elif version == 'latest':
            self.latest = True
        else:
            raise ValueError('Unrecognized Unicode version')

    def __repr__(self, with_date=True):
        if self.latest:
            if with_date:
                return 'latest@{}'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
            else:
                return 'latest'
        else:
            return "{}.{}.{}".format(self.major, self.minor, self.patch)

    @property
    def tag(self):
        """Version identifier without the date suffix (used for paths/URLs)."""
        return self.__repr__(with_date=False)

    def __gt__(self, other):
        # 'latest' sorts above every explicit version.
        if self.latest:
            return True
        return self.numerical > other.numerical

    def __eq__(self, other):
        if self.latest:
            return False
        return self.numerical == other.numerical


class UnicodeData(object):
    """Loads and indexes the UCD and UTS#46 files for one Unicode version."""

    def __init__(self, version, cache, args):
        self.version = UnicodeVersion(version)
        self.system_version = UnicodeVersion(unicodedata.unidata_version)
        self.source = args.source
        self.cache = cache
        self.max = 0  # highest code point seen (set while reading Blocks.txt)

        # Uses UnicodeVersion.__gt__ via reflection, since __lt__ is not defined.
        if self.system_version < self.version:
            print("Warning: Character stability not guaranteed as Python Unicode data {}"
                  " older than requested {}".format(self.system_version, self.version))

        self._load_unicodedata()
        self._load_proplist()
        self._load_derivedcoreprops()
        self._load_blocks()
        self._load_casefolding()
        self._load_hangulst()
        self._load_arabicshaping()
        self._load_scripts()
        self._load_uts46mapping()

    def _load_unicodedata(self):
        """Parse UnicodeData.txt into self.ucd_data, expanding First/Last ranges."""
        f_ud = self._ucdfile('UnicodeData.txt')
        self.ucd_data = {}
        range_begin = None
        for line in f_ud.splitlines():
            fields = line.split(';')
            value = int(fields[0], 16)
            start_marker = re.match(r'^<(?P<name>.*?), First>$', fields[1])
            end_marker = re.match(r'^<(?P<name>.*?), Last>$', fields[1])
            if start_marker:
                range_begin = value
            elif end_marker:
                # Assign the shared range name to every code point in the range.
                for i in range(range_begin, value + 1):
                    fields[1] = '<{}>'.format(end_marker.group('name'))
                    self.ucd_data[i] = fields[1:]
                range_begin = None
            else:
                self.ucd_data[value] = fields[1:]

    def _load_proplist(self):
        """Parse PropList.txt into self.ucd_props (code point -> property list)."""
        f_pl = self._ucdfile('PropList.txt')
        self.ucd_props = collections.defaultdict(list)
        for line in f_pl.splitlines():
            result = re.match(
                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$',
                line)
            if result:
                if result.group('end'):
                    for i in hexrange(result.group('start'), result.group('end')):
                        self.ucd_props[i].append(result.group('prop'))
                else:
                    i = hexvalue(result.group('start'))
                    self.ucd_props[i].append(result.group('prop'))

    def _load_derivedcoreprops(self):
        """Merge DerivedCoreProperties.txt properties into self.ucd_props."""
        f_dcp = self._ucdfile('DerivedCoreProperties.txt')
        for line in f_dcp.splitlines():
            result = re.match(
                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$',
                line)
            if result:
                if result.group('end'):
                    for i in hexrange(result.group('start'), result.group('end')):
                        self.ucd_props[i].append(result.group('prop'))
                else:
                    i = hexvalue(result.group('start'))
                    self.ucd_props[i].append(result.group('prop'))

    def _load_blocks(self):
        """Parse Blocks.txt into self.ucd_block; also tracks self.max."""
        self.ucd_block = {}
        f_b = self._ucdfile('Blocks.txt')
        for line in f_b.splitlines():
            result = re.match(
                r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$',
                line)
            if result:
                for i in hexrange(result.group('start'), result.group('end')):
                    self.ucd_block[i] = result.group('block')
                    self.max = max(self.max, i)

    def _load_casefolding(self):
        """Parse CaseFolding.txt (full/common foldings only) into self.ucd_cf."""
        self.ucd_cf = {}
        f_cf = self._ucdfile('CaseFolding.txt')
        for line in f_cf.splitlines():
            result = re.match(
                r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*',
                line)
            if result:
                if result.group('type') in ('C', 'F'):
                    self.ucd_cf[int(result.group('cp'), 16)] = \
                        ''.join([chr(int(x, 16)) for x in result.group('subst').split(' ')])

    def _load_hangulst(self):
        """Parse HangulSyllableType.txt (ranged entries) into self.ucd_hst."""
        self.ucd_hst = {}
        f_hst = self._ucdfile('HangulSyllableType.txt')
        for line in f_hst.splitlines():
            result = re.match(
                r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$',
                line)
            if result:
                for i in hexrange(result.group('start'), result.group('end')):
                    self.ucd_hst[i] = result.group('type')

    def _load_arabicshaping(self):
        """Parse ArabicShaping.txt joining types into self.ucd_as."""
        self.ucd_as = {}
        f_as = self._ucdfile('ArabicShaping.txt')
        for line in f_as.splitlines():
            result = re.match(r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*.*?\s*;\s*(?P<jt>\S+)\s*;', line)
            if result:
                self.ucd_as[int(result.group('cp'), 16)] = result.group('jt')

    def _load_scripts(self):
        """Parse Scripts.txt into self.ucd_s (script name -> set of code points)."""
        self.ucd_s = {}
        f_s = self._ucdfile('Scripts.txt')
        for line in f_s.splitlines():
            result = re.match(
                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$',
                line)
            if result:
                if not result.group('script') in self.ucd_s:
                    self.ucd_s[result.group('script')] = set()
                if result.group('end'):
                    for i in hexrange(result.group('start'), result.group('end')):
                        self.ucd_s[result.group('script')].add(i)
                else:
                    i = hexvalue(result.group('start'))
                    self.ucd_s[result.group('script')].add(i)

    def _load_uts46mapping(self):
        """Parse the UTS#46 IdnaMappingTable.txt into self.ucd_idnamt."""
        self.ucd_idnamt = {}
        f_idnamt = self._ucdfile('IdnaMappingTable.txt', urlbase=UTS46_URL)
        for line in f_idnamt.splitlines():
            result = re.match(
                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)',
                line)
            if result:
                fields = [x.strip() for x in result.group('fields').split(';')]
                if result.group('end'):
                    for i in hexrange(result.group('start'), result.group('end')):
                        self.ucd_idnamt[i] = fields
                else:
                    i = hexvalue(result.group('start'))
                    self.ucd_idnamt[i] = fields

    def _ucdfile(self, filename, urlbase=UCD_URL):
        """Return the text of a UCD/UTS#46 file.

        Reads from --source if given; otherwise from the on-disk cache when
        present, falling back to downloading (and caching) from *urlbase*.
        """
        if self.source:
            with open("{}/{}".format(self.source, filename), encoding='utf-8') as f:
                return f.read()
        else:
            cache_file = None
            if self.cache:
                cache_file = os.path.expanduser("{}/{}/{}".format(
                    self.cache, self.version.tag, filename))
                if os.path.isfile(cache_file):
                    with open(cache_file, encoding='utf-8') as f:
                        return f.read()

            version_path = self.version.tag
            if version_path == 'latest':
                version_path = 'UCD/latest'
            url = urlbase.format(
                version=version_path,
                filename=filename,
            )
            content = urlopen(url).read().decode('utf-8')

            if cache_file:
                if not os.path.isdir(os.path.dirname(cache_file)):
                    os.makedirs(os.path.dirname(cache_file))
                with open(cache_file, 'wb') as f:
                    f.write(content.encode('utf-8'))

            return str(content)

    def codepoints(self):
        """Yield a CodePoint wrapper for every value from 0 through self.max."""
        for i in range(0, self.max + 1):
            yield CodePoint(i, ucdata=self)


class CodePoint:
    """One code point plus the RFC 5892 / UTS#46 classification predicates."""

    def __init__(self, value=None, ucdata=None):
        self.value = value
        self.ucdata = ucdata

    def _casefold(self, s):
        """Apply the CaseFolding.txt C/F mappings character by character."""
        r = ''
        for c in s:
            r += self.ucdata.ucd_cf.get(ord(c), c)
        return r

    @property
    def exception_value(self):
        """RFC 5892 Section 2.6 override for this code point, or False."""
        return exceptions.get(self.value, False)

    @property
    def compat_value(self):
        """RFC 5892 Section 2.7 backward-compatibility override, or False."""
        return backwardscompatible.get(self.value, False)

    @property
    def name(self):
        if self.value in self.ucdata.ucd_data:
            return self.ucdata.ucd_data[self.value][0]
        elif 'Noncharacter_Code_Point' in self.ucdata.ucd_props[self.value]:
            return '<noncharacter>'
        else:
            return '<reserved>'

    @property
    def general_category(self):
        return self.ucdata.ucd_data.get(self.value, [None, None])[1]

    @property
    def unassigned(self):
        """RFC 5892 Section 2.1: not assigned and not a noncharacter."""
        return not ('Noncharacter_Code_Point' in self.ucdata.ucd_props[self.value] or
                    self.value in self.ucdata.ucd_data)

    @property
    def ldh(self):
        """RFC 5892 Section 2.2: hyphen, ASCII digits, or lowercase letters."""
        if self.value == 0x002d or \
                self.value in range(0x0030, 0x0039 + 1) or \
                self.value in range(0x0061, 0x007a + 1):
            return True
        return False

    @property
    def join_control(self):
        """RFC 5892 Section 2.3."""
        return 'Join_Control' in self.ucdata.ucd_props[self.value]

    @property
    def joining_type(self):
        return self.ucdata.ucd_as.get(self.value, None)

    @property
    def char(self):
        return chr(self.value)

    @property
    def nfkc_cf(self):
        """NFKC(casefold(NFKC(cp))) — the stability transform of RFC 5892 2.4."""
        return unicodedata.normalize('NFKC',
                                     self._casefold(unicodedata.normalize('NFKC', self.char)))

    @property
    def unstable(self):
        """RFC 5892 Section 2.4: changed by the NFKC/casefold round trip."""
        return self.char != self.nfkc_cf

    @property
    def in_ignorableproperties(self):
        """RFC 5892 Section 2.5."""
        for prop in ['Default_Ignorable_Code_Point', 'White_Space', 'Noncharacter_Code_Point']:
            if prop in self.ucdata.ucd_props[self.value]:
                return True
        return False

    @property
    def in_ignorableblocks(self):
        """RFC 5892 Section 2.9."""
        return self.ucdata.ucd_block.get(self.value) in (
            'Combining Diacritical Marks for Symbols', 'Musical Symbols',
            'Ancient Greek Musical Notation'
        )

    @property
    def oldhanguljamo(self):
        """RFC 5892 Section 2.10: conjoining Hangul jamo (L, V, T types)."""
        return self.ucdata.ucd_hst.get(self.value) in ('L', 'V', 'T')

    @property
    def in_lettersdigits(self):
        """RFC 5892 Section 2.11."""
        return self.general_category in ('Ll', 'Lu', 'Lo', 'Nd', 'Lm', 'Mn', 'Mc')

    @property
    def idna2008_status(self):
        """Derived property per the RFC 5892 Section 3 decision procedure.

        The rule order below is significant and mirrors the RFC exactly.
        """
        if self.exception_value:
            return self.exception_value
        elif self.compat_value:
            return self.compat_value
        elif self.unassigned:
            return 'UNASSIGNED'
        elif self.ldh:
            return 'PVALID'
        elif self.join_control:
            return 'CONTEXTJ'
        elif self.unstable:
            return 'DISALLOWED'
        elif self.in_ignorableproperties:
            return 'DISALLOWED'
        elif self.in_ignorableblocks:
            return 'DISALLOWED'
        elif self.oldhanguljamo:
            return 'DISALLOWED'
        elif self.in_lettersdigits:
            return 'PVALID'
        else:
            return 'DISALLOWED'

    @property
    def uts46_data(self):
        return self.ucdata.ucd_idnamt.get(self.value, None)

    @property
    def uts46_status(self):
        return ' '.join(self.uts46_data)


def diagnose_codepoint(codepoint, args, ucdata):
    """Print a rule-by-rule RFC 5892 / UTS#46 report for one code point."""
    cp = CodePoint(codepoint, ucdata=ucdata)

    print("U+{:04X}:".format(codepoint))
    print(" Name: {}".format(cp.name))
    print("1 Exceptions: {}".format(exceptions.get(codepoint, False)))
    print("2 Backwards Compat: {}".format(backwardscompatible.get(codepoint, False)))
    print("3 Unassigned: {}".format(cp.unassigned))
    print("4 LDH: {}".format(cp.ldh))
    print(" Properties: {}".format(" ".join(sorted(ucdata.ucd_props.get(codepoint, ['None'])))))
    print("5 .Join Control: {}".format(cp.join_control))
    print(" NFKC CF: {}".format(" ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])))
    print("6 .Unstable: {}".format(cp.unstable))
    print("7 .Ignorable Prop: {}".format(cp.in_ignorableproperties))
    print(" Block: {}".format(ucdata.ucd_block.get(codepoint, None)))
    print("8 .Ignorable Block: {}".format(cp.in_ignorableblocks))
    print(" Hangul Syll Type: {}".format(ucdata.ucd_hst.get(codepoint, None)))
    print("9 .Old Hangul Jamo: {}".format(cp.oldhanguljamo))
    print(" General Category: {}".format(cp.general_category))
    print("10 .Letters Digits: {}".format(cp.in_lettersdigits))
    print("== IDNA 2008: {}".format(cp.idna2008_status))
    print("== UTS 46: {}".format(cp.uts46_status))
    print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))


def ucdrange(start, end):
    """Format a (start, end) CodePoint pair as IANA-table value/name columns."""
    if start == end:
        return ("{:04X}".format(start.value), start.name)
    else:
        return ("{:04X}..{:04X}".format(start.value, end.value),
                "{}..{}".format(start.name, end.name))


def optimised_list(d):
    """Yield Python-source lines for a tuple literal of interval-packed values."""
    yield '('
    for value in intranges_from_list(d):
        yield ' {},'.format(hex(value))
    yield ' ),'


def make_table(args, ucdata):
    """Emit an IANA-style table of contiguous same-status code point runs."""
    last_status = None
    cps = []
    table_data = []

    # Collapse consecutive code points sharing an IDNA 2008 status into rows.
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if (last_status and last_status != status):
            (values, description) = ucdrange(cps[0], cps[-1])
            table_data.append([values, last_status, description])
            cps = []
        last_status = status
        cps.append(cp)
    # Flush the final run.
    (values, description) = ucdrange(cps[0], cps[-1])
    table_data.append([values, last_status, description])

    if args.dir:
        with open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), 'wb') as f:
            for row in table_data:
                f.write("{:12}; {:12}# {:.44}\n".format(*row).encode('ascii'))
    else:
        for row in table_data:
            print("{:12}; {:12}# {:.44}".format(*row))


def idna_libdata(ucdata):
    """Yield the source lines of the idna library's idnadata.py module."""
    yield "# This file is automatically generated by tools/idna-data\n"
    yield "__version__ = \"{}\"".format(ucdata.version)

    #
    # Script classifications are used by some CONTEXTO rules in RFC 5891
    #
    yield "scripts = {"
    for script in SCRIPT_WHITELIST:
        prefix = " '{0}': ".format(script)
        for line in optimised_list(ucdata.ucd_s[script]):
            yield prefix + line
            prefix = ""
    yield "}"

    #
    # Joining types are used by CONTEXTJ rule A.1
    #
    yield "joining_types = {"
    for cp in ucdata.codepoints():
        if cp.joining_type:
            yield " 0x{0:x}: {1},".format(cp.value, ord(cp.joining_type))
    yield "}"

    #
    # These are the classification of codepoints into PVALID, CONTEXTO, CONTEXTJ, etc.
    #
    yield "codepoint_classes = {"
    classes = {}
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        # UNASSIGNED/DISALLOWED are the implicit default; omit them.
        if status in ('UNASSIGNED', 'DISALLOWED'):
            continue
        if not status in classes:
            classes[status] = set()
        classes[status].add(cp.value)
    for status in ['PVALID', 'CONTEXTJ', 'CONTEXTO']:
        prefix = " '{0}': ".format(status)
        for line in optimised_list(classes[status]):
            yield prefix + line
            prefix = ""
    yield "}"


def uts46_ranges(ucdata):
    """Yield Python tuple literals for the UTS#46 mapping table, run-compressed."""
    last = (None, None)
    for cp in ucdata.codepoints():
        fields = cp.uts46_data
        if not fields:
            continue
        status, mapping = UTS46_STATUSES[fields[0]]
        if mapping:
            mapping = "".join(chr(int(codepoint, 16)) for codepoint in fields[1].split())
            # Escape for embedding inside a single-quoted Python literal.
            mapping = mapping.replace("\\", "\\\\").replace("'", "\\'")
        else:
            mapping = None
        # Above U+00FF, suppress entries identical to the previous one; the
        # table is consulted with a "largest entry <= cp" lookup.
        if cp.value > 255 and (status, mapping) == last:
            continue
        last = (status, mapping)

        if mapping is not None:
            yield "(0x{0:X}, '{1}', u'{2}')".format(cp.value, status, mapping)
        else:
            yield "(0x{0:X}, '{1}')".format(cp.value, status)


def uts46_libdata(ucdata):
    """Yield the source lines of the idna library's uts46data.py module."""
    yield "# This file is automatically generated by tools/idna-data"
    yield "# vim: set fileencoding=utf-8 :\n"
    yield '"""IDNA Mapping Table from UTS46."""\n\n'

    yield "__version__ = \"{}\"".format(ucdata.version)

    # Emit the table in fixed-size _seg_N() functions (Jython method-size limit).
    idx = -1
    for row in uts46_ranges(ucdata):
        idx += 1
        if idx % UTS46_SEGMENT_SIZE == 0:
            if idx != 0:
                yield " ]\n"
            yield "def _seg_{0}():\n return [".format(idx // UTS46_SEGMENT_SIZE)
        yield " {0},".format(row)
    yield " ]\n"

    yield "uts46data = tuple("
    yield " _seg_0()"
    for i in range(1, idx // UTS46_SEGMENT_SIZE + 1):
        yield " + _seg_{0}()".format(i)
    yield ")"


def make_libdata(args, ucdata):
    """Write idnadata.py and uts46data.py into --dir (default: cwd)."""
    dest_dir = args.dir or '.'

    target_filename = os.path.join(dest_dir, 'idnadata.py')
    with open(target_filename, 'wb') as target:
        for line in idna_libdata(ucdata):
            target.write((line + "\n").encode('utf-8'))

    target_filename = os.path.join(dest_dir, 'uts46data.py')
    with open(target_filename, 'wb') as target:
        for line in uts46_libdata(ucdata):
            target.write((line + "\n").encode('utf-8'))


def arg_error(message, parser):
    """Print usage plus *message* and exit with status 2."""
    parser.print_usage()
    print('{}: error: {}'.format(sys.argv[0], message))
    sys.exit(2)


def main():
    parser = argparse.ArgumentParser(description='Determine IDNA code-point validity data')
    parser.add_argument('action', type=str, default='preferred',
                        help='Task to perform (make-libdata, make-tables, <codepoint>)')

    parser.add_argument('--version', type=str, default='preferred',
                        help='Unicode version to use (preferred, latest, <x.y.z>)')
    parser.add_argument('--source', type=str, default=None,
                        help='Where to fetch Unicode data (file path)')
    parser.add_argument('--dir', type=str, default=None, help='Where to export the output')
    parser.add_argument('--cache', type=str, default=None, help='Where to cache Unicode data')
    parser.add_argument('--no-cache', action='store_true', help='Don\'t cache Unicode data')
    libdata = parser.add_argument_group('make-libdata', 'Make module data for Python IDNA library')

    tables = parser.add_argument_group('make-table', 'Make IANA-style reference table')

    codepoint = parser.add_argument_group('codepoint',
                                          'Display related data for given codepoint (e.g. U+0061)')

    args = parser.parse_args()

    if args.version == 'preferred':
        target_version = PREFERRED_VERSION
    else:
        target_version = args.version

    if args.cache and args.no_cache:
        arg_error('I can\'t both --cache and --no-cache', parser)
    cache = args.cache or DEFAULT_CACHE_DIR
    if args.no_cache:
        cache = None

    ucdata = UnicodeData(target_version, cache, args)

    if args.action == 'make-table':
        make_table(args, ucdata)
    elif args.action == 'make-libdata':
        make_libdata(args, ucdata)
    else:
        # re.IGNORECASE instead of an inline '(?i)' after '^': global inline
        # flags not at the start of a pattern are an error on Python 3.11+.
        result = re.match(r'^(U\+|)(?P<cp>[0-9A-F]{4,6})$', args.action, re.IGNORECASE)
        if result:
            codepoint = int(result.group('cp'), 16)
            diagnose_codepoint(codepoint, args, ucdata)
            sys.exit(0)
        arg_error('Don\'t recognize action or codepoint value', parser)


if __name__ == '__main__':
    main()