#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_property_data.py
# Copyright (c) 2016-2019  K.Kosako
"""Generate Unicode property code-range tables from Unicode data files.

The script reads the following files from the current directory:
UnicodeData.txt, DerivedCoreProperties.txt, Scripts.txt, PropList.txt,
emoji-data.txt, PropertyAliases.txt, PropertyValueAliases.txt, Blocks.txt
and (with ``-gc``) GraphemeBreakProperty.txt.

It writes C range tables plus a name/index table to standard output (the
``%{ ... %}`` / ``%%`` layout looks like gperf input -- confirm against the
build) and, unless ``-posix`` is given, a human-readable listing to the
file ``UNICODE_PROPERTIES``.

Command line options:
    -posix   emit only the POSIX bracket properties
    -gc      also load Grapheme_Cluster_Break properties

The code runs unchanged on Python 2 and Python 3.
"""

import sys
import re

POSIX_LIST = [
    'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
    'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
]

MAX_CODE_POINT = 0x10ffff

GRAPHEME_CLUSTER_BREAK_NAME_PREFIX = 'Grapheme_Cluster_Break_'

# All patterns are raw strings so that regex escapes (\s, \w, \d) are not
# interpreted as (invalid) string escapes.
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG = re.compile(r"<.+,\s*Last>")
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# The range separator is a literal "..", so both dots must be escaped.
PR_LINE_REG = re.compile(
    r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(
    r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
UNICODE_VERSION_REG = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")
EMOJI_VERSION_REG = re.compile(r"(?i)#\s*Version:\s*(\d+)\.(\d+)")

VERSION_INFO = [-1, -1, -1]
EMOJI_VERSION_INFO = [-1, -1]

DIC = {}        # property name -> list of (start, end) code point ranges
KDIC = {}       # property name -> class description used in comments
PropIndex = {}  # normalized property name -> index into CodeRanges
PROPERTY_NAME_MAX_LEN = 0
PROPS = None    # list of non-POSIX property names; set up by main()

# Run-time switches; main() overwrites them from the command line.
POSIX_ONLY = False
INCLUDE_GRAPHEME_CLUSTER_DATA = False
OUTPUT_LIST_MODE = False
UPF = None      # open handle of UNICODE_PROPERTIES while listing is active

PRINT_CACHE = {}  # property name -> range list already emitted as CR_<name>

_HEADER_TEXT = '''%{
/* Generated by make_unicode_property_data.py. */
'''

_TABLE_TRAILER_TEXT = '''};

#define pool_offset(s) offsetof(struct unicode_prop_name_pool_t, unicode_prop_name_pool_str##s)

%}
struct PoolPropertyNameCtype {
  short int name;
  short int ctype;
};

%%
'''


def _emit(text='', out=None):
    """Write *text* followed by a newline to *out* (default: sys.stdout).

    Replaces the Python 2 print statement so the script also runs on
    Python 3 without needing a ``__future__`` import.
    """
    stream = sys.stdout if out is None else out
    stream.write(text + '\n')


def _open_text(path):
    """Open a Unicode data file for reading.

    On Python 3 the file is decoded as UTF-8 explicitly instead of with
    the locale encoding; on Python 2 the original byte-string behaviour
    is kept.
    """
    if sys.version_info[0] >= 3:
        return open(path, 'r', encoding='utf-8')
    return open(path, 'r')


def normalize_prop_name(name):
    """Return *name* lower-cased with spaces and underscores removed."""
    return re.sub(r'[ _]', '', name).lower()


def fix_block_name(name):
    """Return the property name for a block: 'In_' + name with runs of
    hyphens/spaces replaced by a single underscore."""
    return 'In_' + re.sub(r'[- ]+', '_', name)


def print_ranges(ranges):
    """Print every (start, end) pair of *ranges*, then the pair count."""
    for (start, end) in ranges:
        _emit("0x%06x, 0x%06x" % (start, end))

    _emit("%d" % len(ranges))


def print_prop_and_index(prop, i):
    """Print one '<name>, <index>' table line and record it in PropIndex."""
    _emit("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i


def print_property(prop, data, desc):
    """Emit the C definition of the range table CR_<prop>.

    If an identical range list was already emitted, only a ``#define``
    alias to the earlier table is written; otherwise the table is written
    in full and remembered in PRINT_CACHE.
    """
    _emit('')
    _emit("/* PROPERTY: '%s': %s */" % (prop, desc))

    prev_prop = dic_find_by_value(PRINT_CACHE, data)
    if prev_prop is not None:
        _emit("#define CR_%s CR_%s" % (prop, prev_prop))
        return

    PRINT_CACHE[prop] = data
    _emit("static const OnigCodePoint")
    _emit("CR_%s[] = { %d," % (prop, len(data)))
    for (start, end) in data:
        _emit("0x%04x, 0x%04x," % (start, end))

    _emit("}; /* END of CR_%s */" % prop)


def dic_find_by_value(dic, v):
    """Return a key of *dic* whose value equals *v*, or None if none does."""
    for key, val in dic.items():
        if val == v:
            return key

    return None


def make_reverse_dic(dic):
    """Return a dict mapping each value of *dic* to the list of its keys."""
    rev = {}
    for key, val in dic.items():
        rev.setdefault(val, []).append(key)

    return rev


def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or adjacent (start, end) ranges.

    The input must be ordered by start unless *sort* is true, in which
    case a sorted copy is processed.  A new list is returned.
    """
    ranges = sorted(in_ranges) if sort else in_ranges

    merged = []
    for (start, end) in ranges:
        # Explicit emptiness test instead of comparing None with an int,
        # which only worked through Python 2's ordering of None.
        if merged and merged[-1][1] >= start - 1:
            (pstart, pend) = merged.pop()
            start = pstart
            end = max(pend, end)

        merged.append((start, end))

    return merged


def inverse_ranges(in_ranges):
    """Return the complement of *in_ranges* within [0, MAX_CODE_POINT].

    *in_ranges* is expected to be sorted and non-overlapping.
    """
    r = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))

        prev = end + 1

    # '<=' so that a gap consisting solely of MAX_CODE_POINT is kept.
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))

    return r


def add_ranges(r1, r2):
    """Return the normalized union of two range lists."""
    return normalize_ranges(r1 + r2, True)


def sub_one_range(one_range, rs):
    """Return the parts of *one_range* not covered by the ranges in *rs*.

    *rs* is scanned in order; the logic relies on it being sorted.
    """
    r = []
    (s1, e1) = one_range
    for (s2, e2) in rs:
        if s1 <= s2 <= e1:
            # rs entry starts inside the remaining range.
            if s2 > s1:
                r.append((s1, s2 - 1))
            if e2 >= e1:
                return r

            s1 = e2 + 1
        elif s2 < s1 and e2 >= s1:
            # rs entry overlaps the head of the remaining range.
            if e2 < e1:
                s1 = e2 + 1
            else:
                return r

    r.append((s1, e1))
    return r


def sub_ranges(r1, r2):
    """Return the ranges of *r1* with everything in *r2* removed."""
    r = []
    for one_range in r1:
        r.extend(sub_one_range(one_range, r2))

    return r


def add_ranges_in_dic(dic):
    """Return the normalized union of all range lists stored in *dic*."""
    r = []
    for v in dic.values():
        r.extend(v)

    return normalize_ranges(r, True)


def normalize_ranges_in_dic(dic, sort=False):
    """Replace every range list in *dic* by its normalized form."""
    for k in list(dic):
        dic[k] = normalize_ranges(dic[k], sort)


def merge_dic(to_dic, from_dic):
    """Update *to_dic* with *from_dic*, reporting key collisions on stderr.

    Colliding keys are overwritten by the values of *from_dic*.
    """
    common = set(to_dic) & set(from_dic)
    if common:
        _emit("merge_dic: collision: %s" % (sorted(common),), sys.stderr)

    to_dic.update(from_dic)


def merge_props(to_props, from_props):
    """Append *from_props* to *to_props*, reporting duplicates on stderr."""
    common = set(to_props) & set(from_props)
    if common:
        _emit("merge_props: collision: %s" % (sorted(common),), sys.stderr)

    to_props.extend(from_props)


def add_range_into_dic(dic, name, start, end):
    """Append the range (start, end) to the list stored under *name*."""
    dic.setdefault(name, []).append((start, end))


def list_sub(a, b):
    """Return the elements of *a* that are not in *b* (order not kept)."""
    return list(set(a) - set(b))


def parse_unicode_data_file(f):
    """Parse UnicodeData.txt-style lines from the iterable *f*.

    Returns ``(dic, assigned)``: *dic* maps each value of the third field
    (and, for two-letter values, also its first letter) to normalized
    ranges; *assigned* lists every range seen, in file order.  Entries
    described as ``<..., First>`` / ``<..., Last>`` form a single range.

    Raises ValueError if a Last entry has no preceding First entry.
    """
    dic = {}
    assigned = []
    range_start = None  # code point of a pending '<..., First>' entry
    for line in f:
        s = line.strip()
        if not s or s[0] == '#':
            continue

        a = s.split(';')
        code = int(a[0], 16)
        desc = a[1]
        prop = a[2]
        if UD_FIRST_REG.match(desc) is not None:
            range_start = code
            continue

        if UD_LAST_REG.match(desc) is not None:
            if range_start is None:
                raise ValueError(
                    "range end without range start: %s" % s)
            start, end = range_start, code
            range_start = None
        else:
            start = end = code

        assigned.append((start, end))
        add_range_into_dic(dic, prop, start, end)
        if len(prop) == 2:
            add_range_into_dic(dic, prop[0:1], start, end)

    normalize_ranges_in_dic(dic)
    return dic, assigned


def parse_properties(path, klass, prop_prefix=None, version_reg=None):
    """Parse a property file with '<code>[..<code>] ; <Name>' lines.

    Each '# Total ...' comment line closes the property seen last: that
    name is appended to the returned list and registered in KDIC with
    *klass*.  If *prop_prefix* is given it is prepended to every name.
    If *version_reg* is given, the first comment line it matches is
    returned as the match object.

    Returns ``(dic, props, version_match)``.
    """
    version_match = None
    dic = {}
    prop = None
    props = []
    with _open_text(path) as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            if (s[0] == '#' and version_reg is not None
                    and version_match is None):
                version_match = version_reg.match(s)
                if version_match is not None:
                    continue

            m = PR_LINE_REG.match(s)
            if m:
                prop = m.group(3)
                if prop_prefix is not None:
                    prop = prop_prefix + prop

                start = int(m.group(1), 16)
                # A single code point is stored as a one-element range.
                end = int(m.group(2), 16) if m.group(2) else start
                add_range_into_dic(dic, prop, start, end)
            elif PR_TOTAL_REG.match(s) is not None and prop is not None:
                # A Total line before any data line has nothing to close.
                KDIC[prop] = klass
                props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props, version_match)


def parse_property_aliases(path):
    """Parse '<alias> ; <name>' lines into a dict alias -> name.

    Lines whose two fields are identical are skipped.
    """
    a = {}
    with _open_text(path) as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            m = PA_LINE_REG.match(s)
            if not m:
                continue

            if m.group(1) == m.group(2):
                continue

            a[m.group(1)] = m.group(2)

    return a


def parse_property_value_aliases(path):
    """Parse 'sc' and 'gc' value-alias lines into a dict alias -> name.

    For 'sc' lines the third field is the target name; for 'gc' lines
    the second field is the target name.  The remaining fields become
    aliases when they differ from the target.
    """
    a = {}
    with _open_text(path) as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            m = PVA_LINE_REG.match(s)
            if not m:
                continue

            cat = m.group(1)
            x2 = m.group(2)
            x3 = m.group(3)
            x4 = m.group(4)
            if cat == 'sc':
                if x2 != x3:
                    a[x2] = x3
                if x4 and x4 != x3:
                    a[x4] = x3
            else:
                if x2 != x3:
                    a[x3] = x2
                if x4 and x4 != x2:
                    a[x4] = x2

    return a


def parse_blocks(path):
    """Parse '<start>..<end>; <Block Name>' lines.

    Returns ``(dic, blocks)`` where *dic* maps 'In_<Block_Name>' to its
    ranges and *blocks* lists those names in file order.  An extra
    'In_No_Block' entry covers every code point outside all blocks.
    """
    dic = {}
    blocks = []
    with _open_text(path) as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            m = BL_LINE_REG.match(s)
            if not m:
                continue

            start = int(m.group(1), 16)
            end = int(m.group(2), 16)
            block = fix_block_name(m.group(3))
            add_range_into_dic(dic, block, start, end)
            blocks.append(block)

    noblock = fix_block_name('No_Block')
    dic[noblock] = inverse_ranges(add_ranges_in_dic(dic))
    blocks.append(noblock)
    return dic, blocks


def add_primitive_props(assigned):
    """Add Assigned, Any, ASCII, NEWLINE, Cn and LC to the global DIC and
    extend category C with the unassigned (Cn) ranges.

    DIC must already contain the 'C', 'Ll', 'Lt' and 'Lu' entries.
    """
    DIC['Assigned'] = normalize_ranges(assigned)
    DIC['Any'] = [(0x000000, 0x10ffff)]
    DIC['ASCII'] = [(0x000000, 0x00007f)]
    DIC['NEWLINE'] = [(0x00000a, 0x00000a)]
    DIC['Cn'] = inverse_ranges(DIC['Assigned'])
    DIC['C'].extend(DIC['Cn'])
    DIC['C'] = normalize_ranges(DIC['C'], True)

    d = []
    d.extend(DIC['Ll'])
    d.extend(DIC['Lt'])
    d.extend(DIC['Lu'])
    DIC['LC'] = normalize_ranges(d, True)


def add_posix_props(dic):
    """Derive the POSIX bracket properties from entries already in *dic*
    and store them in *dic* under their POSIX names."""
    alnum = []
    alnum.extend(dic['Alphabetic'])
    alnum.extend(dic['Nd'])  # Nd == Decimal_Number
    alnum = normalize_ranges(alnum, True)

    blank = [(0x0009, 0x0009)]
    blank.extend(dic['Zs'])  # Zs == Space_Separator
    blank = normalize_ranges(blank, True)

    word = []
    word.extend(dic['Alphabetic'])
    word.extend(dic['M'])    # M == Mark
    word.extend(dic['Nd'])
    word.extend(dic['Pc'])   # Pc == Connector_Punctuation
    word = normalize_ranges(word, True)

    graph = sub_ranges(dic['Any'], dic['White_Space'])
    graph = sub_ranges(graph, dic['Cc'])
    graph = sub_ranges(graph, dic['Cs'])  # Cs == Surrogate
    graph = sub_ranges(graph, dic['Cn'])  # Cn == Unassigned
    graph = normalize_ranges(graph, True)

    p = []
    p.extend(graph)
    p.extend(dic['Zs'])
    p = normalize_ranges(p, True)

    dic['Alpha'] = dic['Alphabetic']
    dic['Upper'] = dic['Uppercase']
    dic['Lower'] = dic['Lowercase']
    dic['Punct'] = dic['P']  # P == Punctuation
    dic['Digit'] = dic['Nd']
    dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
    dic['Alnum'] = alnum
    dic['Space'] = dic['White_Space']
    dic['Blank'] = blank
    dic['Cntrl'] = dic['Cc']
    dic['Word'] = word
    dic['Graph'] = graph
    dic['Print'] = p


def set_max_prop_name(name):
    """Raise PROPERTY_NAME_MAX_LEN to len(name) if that is larger."""
    global PROPERTY_NAME_MAX_LEN
    n = len(name)
    if n > PROPERTY_NAME_MAX_LEN:
        PROPERTY_NAME_MAX_LEN = n


def entry_prop_name(name, index):
    """Account for *name* in the maximum name length and, in list mode,
    write non-POSIX entries to the UNICODE_PROPERTIES listing."""
    set_max_prop_name(name)
    if OUTPUT_LIST_MODE and index >= len(POSIX_LIST):
        _emit("%3d: %s" % (index, name), UPF)


def entry_and_print_prop_and_index(name, index):
    """Register *name* and print its normalized form with *index*."""
    entry_prop_name(name, index)
    nname = normalize_prop_name(name)
    print_prop_and_index(nname, index)


def parse_and_merge_properties(path, klass, prop_prefix=None,
                               version_reg=None):
    """Run parse_properties() and merge its result into DIC and PROPS.

    Returns the same ``(dic, props, version_match)`` tuple.
    """
    dic, props, ver_m = parse_properties(path, klass, prop_prefix,
                                         version_reg)
    merge_dic(DIC, dic)
    merge_props(PROPS, props)
    return dic, props, ver_m


def _require_versions():
    """Raise RuntimeError unless both version numbers were found."""
    if VERSION_INFO[0] < 0:
        raise RuntimeError("Unicode Version is not found")
    if EMOJI_VERSION_INFO[0] < 0:
        raise RuntimeError("Emoji Version is not found")


def _default_class(prop):
    """Return the class description for a property missing from KDIC,
    chosen by the length of its name."""
    n = len(prop)
    if n == 1:
        return 'Major Category'
    if n == 2:
        return 'General Category'
    return '-'


def main(argv=None):
    """Run the generator; *argv* defaults to sys.argv.  Returns 0.

    Raises RuntimeError when the Unicode or Emoji version cannot be found
    and the full (non ``-posix``) output was requested.
    """
    global DIC, PROPS, POSIX_ONLY, INCLUDE_GRAPHEME_CLUSTER_DATA
    global OUTPUT_LIST_MODE, UPF

    if argv is None:
        argv = sys.argv

    POSIX_ONLY = False
    INCLUDE_GRAPHEME_CLUSTER_DATA = False
    for arg in argv[1:]:
        if arg == '-posix':
            POSIX_ONLY = True
        elif arg == '-gc':
            INCLUDE_GRAPHEME_CLUSTER_DATA = True
        else:
            # Unknown arguments are reported but do not stop the run.
            _emit("Invalid argument: %s" % arg, sys.stderr)

    OUTPUT_LIST_MODE = not POSIX_ONLY

    # ---- load the data files ----
    with _open_text('UnicodeData.txt') as f:
        dic, assigned = parse_unicode_data_file(f)
    DIC = dic
    add_primitive_props(assigned)

    PROPS = list_sub(DIC.keys(), POSIX_LIST)

    _, _, ver_m = parse_and_merge_properties(
        'DerivedCoreProperties.txt', 'Derived Property', None,
        UNICODE_VERSION_REG)
    if ver_m is not None:
        VERSION_INFO[0] = int(ver_m.group(1))
        VERSION_INFO[1] = int(ver_m.group(2))
        VERSION_INFO[2] = int(ver_m.group(3))

    dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script')
    DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

    parse_and_merge_properties('PropList.txt', 'Binary Property')

    _, _, ver_m = parse_and_merge_properties(
        'emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG)
    if ver_m is not None:
        EMOJI_VERSION_INFO[0] = int(ver_m.group(1))
        EMOJI_VERSION_INFO[1] = int(ver_m.group(2))

    PROPS.append('Unknown')
    KDIC['Unknown'] = 'Script'

    aliases = parse_property_aliases('PropertyAliases.txt')
    merge_dic(aliases,
              parse_property_value_aliases('PropertyValueAliases.txt'))

    dic, blocks = parse_blocks('Blocks.txt')
    merge_dic(DIC, dic)

    if INCLUDE_GRAPHEME_CLUSTER_DATA:
        dic, props, _ = parse_properties('GraphemeBreakProperty.txt',
                                         'GraphemeBreak Property',
                                         GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
        merge_dic(DIC, dic)
        merge_props(PROPS, props)
        #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other'
        #DIC[prop] = inverse_ranges(add_ranges_in_dic(dic))
        #PROPS.append(prop)
        #KDIC[prop] = 'GraphemeBreak Property'

    add_posix_props(DIC)
    PROPS = sorted(PROPS)

    # ---- range tables ----
    _emit(_HEADER_TEXT)
    for prop in POSIX_LIST:
        print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

    _emit('')

    if not POSIX_ONLY:
        for prop in PROPS:
            klass = KDIC.get(prop, None)
            if klass is None:
                klass = _default_class(prop)

            print_property(prop, DIC[prop], klass)

        for block in blocks:
            print_property(block, DIC[block], 'Block')

    _emit('')
    _emit("static const OnigCodePoint*\nconst CodeRanges[] = {")

    for prop in POSIX_LIST:
        _emit(" CR_%s," % prop)

    if not POSIX_ONLY:
        for prop in PROPS:
            _emit(" CR_%s," % prop)

        for prop in blocks:
            _emit(" CR_%s," % prop)

    sys.stdout.write(_TABLE_TRAILER_TEXT)

    # ---- name table and property listing ----
    if OUTPUT_LIST_MODE:
        # Validate before opening so an existing listing is not truncated
        # when the version information is missing.
        _require_versions()
        UPF = open("UNICODE_PROPERTIES", "w")

    try:
        if OUTPUT_LIST_MODE:
            _emit("Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)"
                  % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2],
                     EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]), UPF)
            _emit('', UPF)

        index = -1
        for prop in POSIX_LIST:
            index += 1
            entry_and_print_prop_and_index(prop, index)

        if not POSIX_ONLY:
            for prop in PROPS:
                index += 1
                entry_and_print_prop_and_index(prop, index)

            naliases = sorted((normalize_prop_name(k), k, v)
                              for (k, v) in aliases.items())
            for (nk, k, v) in naliases:
                nv = normalize_prop_name(v)
                if PropIndex.get(nk, None) is not None:
                    _emit("ALIASES: already exists: %s => %s" % (k, v),
                          sys.stderr)
                    continue
                aindex = PropIndex.get(nv, None)
                if aindex is None:
                    # The alias target is not a generated property.
                    continue

                entry_prop_name(k, aindex)
                print_prop_and_index(nk, aindex)

            for name in blocks:
                index += 1
                entry_and_print_prop_and_index(name, index)
    finally:
        # Close the listing even if generation fails part-way.
        if UPF is not None:
            UPF.close()
            UPF = None

    # ---- trailing #defines ----
    _emit('%%')
    _emit('')
    if not POSIX_ONLY:
        # Versions were validated above (list mode == not POSIX_ONLY).
        _emit("#define UNICODE_PROPERTY_VERSION %02d%02d%02d"
              % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]))
        _emit("#define UNICODE_EMOJI_VERSION %02d%02d"
              % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]))
        _emit('')

    _emit("#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10))
    _emit("#define CODE_RANGES_NUM %d" % (index + 1))

    index_props = make_reverse_dic(PropIndex)
    _emit('')
    for i in range(index + 1):
        for p in index_props[i]:
            _emit("#define PROP_INDEX_%s %d" % (p.upper(), i))

    return 0


if __name__ == '__main__':
    sys.exit(main())