#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# make_unicode_property_data.py
# Copyright (c) 2016-2020 K.Kosako
"""Generate Unicode property data (gperf input / C source) for Oniguruma.

Reads the Unicode Character Database files from the current directory:
  UnicodeData.txt, DerivedCoreProperties.txt, Scripts.txt, PropList.txt,
  emoji-data.txt, PropertyAliases.txt, PropertyValueAliases.txt, Blocks.txt
  (and GraphemeBreakProperty.txt with -gc)
and writes the property code-range tables to stdout.

Options:
  -posix   emit only the POSIX bracket-expression classes
  -gc      also include Grapheme_Cluster_Break_* properties
"""

import sys
import re

# POSIX bracket classes ([[:alpha:]] etc.) always emitted, in fixed order.
POSIX_LIST = [
  'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
  'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
]

MAX_CODE_POINT = 0x10ffff

GRAPHEME_CLUSTER_BREAK_NAME_PREFIX = 'Grapheme_Cluster_Break_'

# UnicodeData.txt range sentinels: "<Name, First>" / "<Name, Last>".
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG  = re.compile(r"<.+,\s*Last>")
# "# Total code points:" / "# Total elements:" marks the end of one property.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# "XXXX..YYYY ; Prop" or "XXXX ; Prop".
# NOTE: '..' must be escaped; an unescaped '..' matches ANY two characters
# and only parsed correctly by accident of the UCD file format.
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
UNICODE_VERSION_REG = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")
EMOJI_VERSION_REG   = re.compile(r"(?i)#\s*Version:\s*(\d+)\.(\d+)")

VERSION_INFO = [-1, -1, -1]        # Unicode major/minor/patch, -1 = unknown
EMOJI_VERSION_INFO = [-1, -1]      # Emoji major/minor, -1 = unknown

DIC  = { }     # property name -> list of (start, end) code point ranges
KDIC = { }     # property name -> property class/kind string
PropIndex = { }                    # normalized property name -> index
PROPERTY_NAME_MAX_LEN = 0
PROPS = None

def normalize_prop_name(name):
    """Normalize a property name for lookup: drop spaces/underscores, lower."""
    return re.sub(r'[ _]', '', name).lower()

def fix_block_name(name):
    """Turn a Blocks.txt block name into an 'In_Xxx' property name."""
    return 'In_' + re.sub(r'[- ]+', '_', name)

def print_ranges(ranges):
    """Debug helper: print each (start, end) range and the total count."""
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))

    print(len(ranges))

def print_prop_and_index(prop, i):
    """Emit one 'name, index' table line and record it in PropIndex."""
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i

# data (ranges tuple/list) already emitted -> reuse via #define alias.
PRINT_CACHE = { }

def print_property(prop, data, desc):
    """Emit the CR_<prop> code-range array (or a #define alias if an
    identical range list was already emitted for another property)."""
    print('')
    print("/* PROPERTY: '%s': %s */" % (prop, desc))

    prev_prop = dic_find_by_value(PRINT_CACHE, data)
    if prev_prop is not None:
        print("#define CR_%s CR_%s" % (prop, prev_prop))
    else:
        PRINT_CACHE[prop] = data
        print("static const OnigCodePoint")
        print("CR_%s[] = { %d," % (prop, len(data)))
        for (start, end) in data:
            print("0x%04x, 0x%04x," % (start, end))

        print("}; /* END of CR_%s */" % prop)


def dic_find_by_value(dic, v):
    """Return the first key in dic whose value equals v, or None."""
    for key, val in dic.items():
        if val == v:
            return key

    return None

def make_reverse_dic(dic):
    """Invert dic: value -> list of keys that map to it."""
    rev = {}
    for key, val in dic.items():
        d = rev.get(val, None)
        if d is None:
            rev[val] = [key]
        else:
            d.append(key)

    return rev

def normalize_ranges(in_ranges, sort=False):
    """Merge adjacent/overlapping (start, end) ranges; optionally sort first.

    Input must be sorted by start unless sort=True.
    """
    if sort:
        ranges = sorted(in_ranges)
    else:
        ranges = in_ranges

    r = []
    prev = None
    for (start, end) in ranges:
        # Merge when this range touches or overlaps the previous one.
        # (Explicit None check: Python 3 cannot order None against an int.)
        if prev is not None and prev >= start - 1:
            (pstart, pend) = r.pop()
            end = max(pend, end)
            start = pstart

        r.append((start, end))
        prev = end

    return r

def inverse_ranges(in_ranges):
    """Return the complement of sorted ranges within 0..MAX_CODE_POINT."""
    r = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))

        prev = end + 1

    # '<=' so the single point U+10FFFF is kept when the last input
    # range ends exactly at U+10FFFE (the old '<' dropped it).
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))

    return r

def add_ranges(r1, r2):
    """Union of two range lists, normalized."""
    return normalize_ranges(r1 + r2, True)

def sub_one_range(one_range, rs):
    """Subtract every range in rs from the single range one_range."""
    r = []
    (s1, e1) = one_range
    n = len(rs)
    for i in range(0, n):
        (s2, e2) = rs[i]
        if s2 >= s1 and s2 <= e1:
            if s2 > s1:
                r.append((s1, s2 - 1))
            if e2 >= e1:
                return r

            s1 = e2 + 1
        elif s2 < s1 and e2 >= s1:
            if e2 < e1:
                s1 = e2 + 1
            else:
                return r

    r.append((s1, e1))
    return r

def sub_ranges(r1, r2):
    """Subtract range list r2 from range list r1."""
    r = []
    for one_range in r1:
        rs = sub_one_range(one_range, r2)
        r.extend(rs)

    return r

def add_ranges_in_dic(dic):
    """Union of all range lists stored in dic, normalized."""
    r = []
    for k, v in dic.items():
        r = r + v

    return normalize_ranges(r, True)

def normalize_ranges_in_dic(dic, sort=False):
    """Normalize every range list in dic in place."""
    for k, v in dic.items():
        r = normalize_ranges(v, sort)
        dic[k] = r

def merge_dic(to_dic, from_dic):
    """Merge from_dic into to_dic, warning about colliding keys."""
    to_keys   = to_dic.keys()
    from_keys = from_dic.keys()
    common = list(set(to_keys) & set(from_keys))
    if len(common) != 0:
        print("merge_dic: collision: %s" % sorted(common), file=sys.stderr)

    to_dic.update(from_dic)

def merge_props(to_props, from_props):
    """Append from_props to to_props, warning about duplicates."""
    common = list(set(to_props) & set(from_props))
    if len(common) != 0:
        print("merge_props: collision: %s" % sorted(common), file=sys.stderr)

    to_props.extend(from_props)

def add_range_into_dic(dic, name, start, end):
    """Append the range (start, end) to dic[name], creating it if needed."""
    d = dic.get(name, None)
    if d is None:
        dic[name] = [(start, end)]
    else:
        d.append((start, end))

def list_sub(a, b):
    """Return list of elements of a not in b (order not preserved)."""
    return list(set(a) - set(b))


def parse_unicode_data_file(f):
    """Parse UnicodeData.txt.

    Returns (dic, assigned): dic maps a General_Category value (and its
    one-letter major category) to ranges; assigned is the list of all
    assigned code point ranges.  '<..., First>'/'<..., Last>' pairs are
    folded into single ranges.
    """
    dic = { }
    assigned = []
    for line in f:
        s = line.strip()
        if len(s) == 0:
            continue
        if s[0] == '#':
            continue

        a = s.split(';')
        code = int(a[0], 16)
        desc = a[1]
        prop = a[2]
        if UD_FIRST_REG.match(desc) is not None:
            start = code
            end = None
        elif UD_LAST_REG.match(desc) is not None:
            end = code
        else:
            start = end = code

        if end is not None:
            assigned.append((start, end))
            add_range_into_dic(dic, prop, start, end)
            if len(prop) == 2:
                # two-letter gc (e.g. 'Lu') also feeds its major category 'L'
                add_range_into_dic(dic, prop[0:1], start, end)

    normalize_ranges_in_dic(dic)
    return dic, assigned

def parse_properties(path, klass, prop_prefix = None, version_reg = None):
    """Parse a 'XXXX[..YYYY] ; PropName' style UCD file.

    klass is recorded in KDIC for every property found.  If version_reg is
    given, the first matching comment line is captured as the version.
    Returns (dic, props, version_match).
    """
    version_match = None
    with open(path, 'r') as f:
        dic = { }
        prop = None
        props = []
        for line in f:
            s = line.strip()
            if len(s) == 0:
                continue

            if s[0] == '#' and version_reg is not None and version_match is None:
                version_match = version_reg.match(s)
                if version_match is not None:
                    continue

            m = PR_LINE_REG.match(s)
            if m:
                prop = m.group(3)
                if prop_prefix is not None:
                    prop = prop_prefix + prop

                if m.group(2):
                    start = int(m.group(1), 16)
                    end   = int(m.group(2), 16)
                    add_range_into_dic(dic, prop, start, end)
                else:
                    start = int(m.group(1), 16)
                    add_range_into_dic(dic, prop, start, start)

            elif PR_TOTAL_REG.match(s) is not None:
                # '# Total ...' closes the block for the current property
                KDIC[prop] = klass
                props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props, version_match)

def parse_property_aliases(path):
    """Parse PropertyAliases.txt into {alias: canonical_name}."""
    a = { }
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if len(s) == 0:
                continue

            m = PA_LINE_REG.match(s)
            if not(m):
                continue

            if m.group(1) == m.group(2):
                continue

            a[m.group(1)] = m.group(2)

    return a

def parse_property_value_aliases(path):
    """Parse PropertyValueAliases.txt ('sc' and 'gc' lines only).

    For scripts ('sc') the long name (field 3) is canonical; for general
    categories ('gc') the short name (field 2) is canonical.
    """
    a = { }
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if len(s) == 0:
                continue

            m = PVA_LINE_REG.match(s)
            if not(m):
                continue

            cat = m.group(1)
            x2 = m.group(2)
            x3 = m.group(3)
            x4 = m.group(4)
            if cat == 'sc':
                if x2 != x3:
                    a[x2] = x3
                if x4 and x4 != x3:
                    a[x4] = x3
            else:
                if x2 != x3:
                    a[x3] = x2
                if x4 and x4 != x2:
                    a[x4] = x2

    return a

def parse_blocks(path):
    """Parse Blocks.txt; returns (dic, blocks) with 'In_Xxx' names.

    A synthetic In_No_Block entry covers every code point not in any block.
    """
    dic = { }
    blocks = []
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if len(s) == 0:
                continue

            m = BL_LINE_REG.match(s)
            if not(m):
                continue

            start = int(m.group(1), 16)
            end   = int(m.group(2), 16)
            block = fix_block_name(m.group(3))
            add_range_into_dic(dic, block, start, end)
            blocks.append(block)

    noblock = fix_block_name('No_Block')
    dic[noblock] = inverse_ranges(add_ranges_in_dic(dic))
    blocks.append(noblock)
    return dic, blocks

def add_primitive_props(assigned):
    """Add derived primitives to DIC: Assigned, Any, ASCII, NEWLINE, Cn, LC.

    Cn (Unassigned) is the complement of Assigned; C gains Cn; LC (Cased
    Letter) is Ll + Lt + Lu.
    """
    DIC['Assigned'] = normalize_ranges(assigned)
    DIC['Any']      = [(0x000000, 0x10ffff)]
    DIC['ASCII']    = [(0x000000, 0x00007f)]
    DIC['NEWLINE']  = [(0x00000a, 0x00000a)]
    DIC['Cn'] = inverse_ranges(DIC['Assigned'])
    DIC['C'].extend(DIC['Cn'])
    DIC['C'] = normalize_ranges(DIC['C'], True)

    d = []
    d.extend(DIC['Ll'])
    d.extend(DIC['Lt'])
    d.extend(DIC['Lu'])
    DIC['LC'] = normalize_ranges(d, True)

def add_posix_props(dic):
    """Define the POSIX bracket classes in terms of Unicode properties."""
    alnum = []
    alnum.extend(dic['Alphabetic'])
    alnum.extend(dic['Nd'])     # Nd == Decimal_Number
    alnum = normalize_ranges(alnum, True)

    blank = [(0x0009, 0x0009)]
    blank.extend(dic['Zs'])     # Zs == Space_Separator
    blank = normalize_ranges(blank, True)

    word = []
    word.extend(dic['Alphabetic'])
    word.extend(dic['M'])       # M  == Mark
    word.extend(dic['Nd'])
    word.extend(dic['Pc'])      # Pc == Connector_Punctuation
    word = normalize_ranges(word, True)

    # graph = Any - White_Space - Cc - Cs - Cn
    graph = sub_ranges(dic['Any'], dic['White_Space'])
    graph = sub_ranges(graph, dic['Cc'])
    graph = sub_ranges(graph, dic['Cs'])   # Cs == Surrogate
    graph = sub_ranges(graph, dic['Cn'])   # Cn == Unassigned
    graph = normalize_ranges(graph, True)

    p = []
    p.extend(graph)
    p.extend(dic['Zs'])
    p = normalize_ranges(p, True)

    dic['Alpha']  = dic['Alphabetic']
    dic['Upper']  = dic['Uppercase']
    dic['Lower']  = dic['Lowercase']
    dic['Punct']  = dic['P']    # P == Punctuation
    dic['Digit']  = dic['Nd']
    dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
    dic['Alnum']  = alnum
    dic['Space']  = dic['White_Space']
    dic['Blank']  = blank
    dic['Cntrl']  = dic['Cc']
    dic['Word']   = word
    dic['Graph']  = graph
    dic['Print']  = p


def set_max_prop_name(name):
    """Track the longest property name seen (for PROPERTY_NAME_MAX_SIZE)."""
    global PROPERTY_NAME_MAX_LEN
    n = len(name)
    if n > PROPERTY_NAME_MAX_LEN:
        PROPERTY_NAME_MAX_LEN = n

def entry_prop_name(name, index):
    """Register a property name; list non-POSIX names in UNICODE_PROPERTIES."""
    set_max_prop_name(name)
    if OUTPUT_LIST_MODE and index >= len(POSIX_LIST):
        print("%s" % (name), file=UPF)

def entry_and_print_prop_and_index(name, index):
    """Register a property name and emit its normalized 'name, index' line."""
    entry_prop_name(name, index)
    nname = normalize_prop_name(name)
    print_prop_and_index(nname, index)

def parse_and_merge_properties(path, klass, prop_prefix = None, version_reg = None):
    """parse_properties() then merge the result into the global DIC/PROPS."""
    dic, props, ver_m = parse_properties(path, klass, prop_prefix, version_reg)
    merge_dic(DIC, dic)
    merge_props(PROPS, props)
    return dic, props, ver_m


### main ###
argv = sys.argv
argc = len(argv)

# License header reproduced verbatim in the generated C source.
COPYRIGHT = '''
/*-
 * Copyright (c) 2016-2020  K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()

POSIX_ONLY = False
INCLUDE_GRAPHEME_CLUSTER_DATA = False

for i in range(1, argc):
    arg = argv[i]
    if arg == '-posix':
        POSIX_ONLY = True
    elif arg == '-gc':
        INCLUDE_GRAPHEME_CLUSTER_DATA = True
    else:
        print("Invalid argument: %s" % arg, file=sys.stderr)


OUTPUT_LIST_MODE = not(POSIX_ONLY)

with open('UnicodeData.txt', 'r') as f:
    dic, assigned = parse_unicode_data_file(f)
    DIC = dic
    add_primitive_props(assigned)

PROPS = DIC.keys()
PROPS = list_sub(PROPS, POSIX_LIST)

_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG)
if ver_m is not None:
    VERSION_INFO[0] = int(ver_m.group(1))
    VERSION_INFO[1] = int(ver_m.group(2))
    VERSION_INFO[2] = int(ver_m.group(3))

dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script')
# Unknown script == every code point with no Scripts.txt entry
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

parse_and_merge_properties('PropList.txt', 'Binary Property')

_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG)
if ver_m is not None:
    EMOJI_VERSION_INFO[0] = int(ver_m.group(1))
    EMOJI_VERSION_INFO[1] = int(ver_m.group(2))

PROPS.append('Unknown')
KDIC['Unknown'] = 'Script'

ALIASES = parse_property_aliases('PropertyAliases.txt')
a = parse_property_value_aliases('PropertyValueAliases.txt')
merge_dic(ALIASES, a)

dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)

if INCLUDE_GRAPHEME_CLUSTER_DATA:
    dic, props, _ = parse_properties('GraphemeBreakProperty.txt',
                                     'GraphemeBreak Property',
                                     GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
    merge_dic(DIC, dic)
    merge_props(PROPS, props)
    #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other'
    #DIC[prop] = inverse_ranges(add_ranges_in_dic(dic))
    #PROPS.append(prop)
    #KDIC[prop] = 'GrapemeBreak Property'

add_posix_props(DIC)
# ---- emission phase: write the gperf input / C tables to stdout ----
PROPS = sorted(PROPS)


s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
print(s)
print(COPYRIGHT)
print('')

# Code-range arrays: POSIX classes first (fixed order), then everything else.
for prop in POSIX_LIST:
    print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

print('')

if not(POSIX_ONLY):
    for prop in PROPS:
        klass = KDIC.get(prop, None)
        if klass is None:
            # No recorded class: infer from General_Category name length.
            n = len(prop)
            if n == 1:
                klass = 'Major Category'
            elif n == 2:
                klass = 'General Category'
            else:
                klass = '-'

        print_property(prop, DIC[prop], klass)

    for block in BLOCKS:
        print_property(block, DIC[block], 'Block')


# Index table: CodeRanges[i] must line up with the indexes emitted below.
print('')
print("static const OnigCodePoint*\nconst CodeRanges[] = {")

for prop in POSIX_LIST:
    print(" CR_%s," % prop)

if not(POSIX_ONLY):
    for prop in PROPS:
        print(" CR_%s," % prop)

    for prop in BLOCKS:
        print(" CR_%s," % prop)

s = '''};

#define pool_offset(s) offsetof(struct unicode_prop_name_pool_t, unicode_prop_name_pool_str##s)

%}
struct PoolPropertyNameCtype {
  short int name;
  short int ctype;
};

%%
'''
sys.stdout.write(s)

if OUTPUT_LIST_MODE:
    # Human-readable list of supported property names (non-POSIX only).
    UPF = open("UNICODE_PROPERTIES", "w")
    if VERSION_INFO[0] < 0:
        raise RuntimeError("Unicode Version is not found")
    if EMOJI_VERSION_INFO[0] < 0:
        raise RuntimeError("Emoji Version is not found")

    print("Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]), file=UPF)
    print('', file=UPF)

index = -1
for prop in POSIX_LIST:
    index += 1
    entry_and_print_prop_and_index(prop, index)

if not(POSIX_ONLY):
    for prop in PROPS:
        index += 1
        entry_and_print_prop_and_index(prop, index)

    # Aliases map onto the index of their canonical property; no new ranges.
    # (List comprehension replaces py2-only 'lambda (k,v):' tuple unpacking.)
    NALIASES = [(normalize_prop_name(k), k, v) for k, v in ALIASES.items()]
    NALIASES = sorted(NALIASES)
    for (nk, k, v) in NALIASES:
        nv = normalize_prop_name(v)
        if PropIndex.get(nk, None) is not None:
            print("ALIASES: already exists: %s => %s" % (k, v), file=sys.stderr)
            continue
        aindex = PropIndex.get(nv, None)
        if aindex is None:
            #print("ALIASES: value is not exist: %s => %s" % (k, v), file=sys.stderr)
            continue

        entry_prop_name(k, aindex)
        print_prop_and_index(nk, aindex)

    for name in BLOCKS:
        index += 1
        entry_and_print_prop_and_index(name, index)

print('%%')
print('')
if not(POSIX_ONLY):
    if VERSION_INFO[0] < 0:
        raise RuntimeError("Unicode Version is not found")
    if EMOJI_VERSION_INFO[0] < 0:
        raise RuntimeError("Emoji Version is not found")

    print("#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]))
    print("#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]))
    print('')

print("#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10))
print("#define CODE_RANGES_NUM %d" % (index + 1))

index_props = make_reverse_dic(PropIndex)
print('')
for i in range(index + 1):
    for p in index_props[i]:
        print("#define PROP_INDEX_%s %d" % (p.upper(), i))

if OUTPUT_LIST_MODE:
    UPF.close()

sys.exit(0)