1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# make_unicode_property_data.py
4# Copyright (c) 2016-2019  K.Kosako
5
6import sys
7import re
8
# POSIX bracket names.  The main script emits these before any other
# property, so a property's position in this list is also its table index.
POSIX_LIST = [
    'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
    'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
]

MAX_CODE_POINT = 0x10ffff

GRAPHEME_CLUSTER_BREAK_NAME_PREFIX = 'Grapheme_Cluster_Break_'

# All patterns are raw strings so that regex escapes (\s, \d, \w, \.) reach
# the regex engine untouched instead of relying on Python passing unknown
# string escapes through.

# "<..., First>" / "<..., Last>" descriptions that open/close a range.
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG  = re.compile(r"<.+,\s*Last>")
# "# Total code points:" / "# Total elements:" trailer comment.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# "XXXX ; name" or "XXXX..YYYY ; name".  The range separator is matched as
# two literal dots; an unescaped ".." would accept any two characters.
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
# "word ; word"
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
# "sc|gc ; word ; word [; word]"
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
# "XXXX..YYYY; free text"
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# "# <anything>-MAJOR.MINOR.PATCH.txt"
UNICODE_VERSION_REG = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")
# "# Version: MAJOR.MINOR" (case-insensitive)
EMOJI_VERSION_REG   = re.compile(r"(?i)#\s*Version:\s*(\d+)\.(\d+)")

# Filled in by the main script from the version comments found in the data
# files; -1 means "not found yet".
VERSION_INFO = [-1, -1, -1]
EMOJI_VERSION_INFO = [-1, -1]

DIC  = { }          # property name -> list of (start, end) ranges
KDIC = { }          # property name -> class label passed to parse_properties
PropIndex = { }     # normalized property name -> table index
PROPERTY_NAME_MAX_LEN = 0
PROPS = None
36
def normalize_prop_name(name):
  """Return *name* lower-cased with every space and underscore removed."""
  return name.replace(' ', '').replace('_', '').lower()
41
def fix_block_name(name):
  """Turn a block title into an identifier-like name prefixed with 'In_'.

  Every run of hyphens and/or spaces is collapsed into a single underscore.
  """
  pieces = re.split(r'[- ]+', name)
  return 'In_' + '_'.join(pieces)
45
def print_ranges(ranges):
  """Print every (start, end) pair as 6-digit hex, then the number of pairs."""
  for lo, hi in ranges:
    print("0x%06x, 0x%06x" % (lo, hi))

  print(len(ranges))
51
def print_prop_and_index(prop, i):
  """Print one 'name, index' line and record the mapping in PropIndex."""
  entry = prop + ','
  print("%-35s %3d" % (entry, i))
  PropIndex[prop] = i
55
# property name -> range list, for every table that was printed in full
PRINT_CACHE = { }

def print_property(prop, data, desc):
  """Print the C code-range table for *prop*.

  If an identical range list was already printed under another name, only a
  '#define' aliasing the new name to the earlier table is emitted.
  Otherwise the full array is printed and *data* is remembered in
  PRINT_CACHE for later de-duplication.
  """
  print('')
  print("/* PROPERTY: '%s': %s */" % (prop, desc))

  alias_of = dic_find_by_value(PRINT_CACHE, data)
  if alias_of is None:
    PRINT_CACHE[prop] = data
    print("static const OnigCodePoint")
    print("CR_%s[] = { %d," % (prop, len(data)))
    for (lo, hi) in data:
      print("0x%04x, 0x%04x," % (lo, hi))

    print("}; /* END of CR_%s */" % prop)
    return

  print("#define CR_%s CR_%s" % (prop, alias_of))
73
74
def dic_find_by_value(dic, v):
  """Return the first key of *dic* whose value equals *v*, or None."""
  return next((key for key, val in dic.items() if val == v), None)
81
def make_reverse_dic(dic):
  """Invert *dic*: map each value to the list of keys that carried it."""
  reverse = {}
  for key, val in dic.items():
    if reverse.get(val) is None:
      reverse[val] = [key]
    else:
      reverse[val].append(key)

  return reverse
92
def normalize_ranges(in_ranges, sort=False):
  """Merge overlapping or adjacent (start, end) ranges into a new list.

  Each range is compared only with the range emitted just before it, so the
  input has to be ordered by start already unless *sort* is true, in which
  case a sorted copy is processed.  *in_ranges* itself is never modified.
  """
  ranges = sorted(in_ranges) if sort else in_ranges

  merged = []
  prev_end = None
  for (start, end) in ranges:
    # Test for "no previous range" explicitly: comparing None with an int
    # only works through Python 2's arbitrary ordering and raises TypeError
    # on Python 3.
    if prev_end is not None and prev_end >= start - 1:
      (pstart, pend) = merged.pop()
      end = max(pend, end)
      start = pstart

    merged.append((start, end))
    prev_end = end

  return merged
111
def inverse_ranges(in_ranges):
  """Return the complement of *in_ranges* within 0 .. MAX_CODE_POINT.

  *in_ranges* is a list of (start, end) pairs; the loop assumes they are
  sorted and non-overlapping (as produced by normalize_ranges).
  """
  gaps = []
  next_free = 0x000000
  for (start, end) in in_ranges:
    if next_free < start:
      gaps.append((next_free, start - 1))

    next_free = end + 1

  # "<=" so that a single trailing code point is kept when the input ends
  # exactly at MAX_CODE_POINT - 1.
  if next_free <= MAX_CODE_POINT:
    gaps.append((next_free, MAX_CODE_POINT))

  return gaps
125
def add_ranges(r1, r2):
  """Return the sorted, merged union of two range lists."""
  return normalize_ranges(r1 + r2, sort=True)
129
def sub_one_range(one_range, rs):
  """Subtract every range in *rs* from the single range *one_range*.

  Returns the surviving pieces as a list of (start, end) pairs.  *rs* is
  walked once front to back while the low edge of what is left only ever
  moves upward.
  """
  remaining = []
  (lo, hi) = one_range
  for (cut_lo, cut_hi) in rs:
    if lo <= cut_lo <= hi:
      # The cut starts inside what is left: keep the part in front of it.
      if lo < cut_lo:
        remaining.append((lo, cut_lo - 1))
      if cut_hi >= hi:
        return remaining

      lo = cut_hi + 1
    elif cut_lo < lo <= cut_hi:
      # The cut starts before what is left and reaches into it.
      if cut_hi >= hi:
        return remaining

      lo = cut_hi + 1

  remaining.append((lo, hi))
  return remaining
151
def sub_ranges(r1, r2):
  """Return the ranges of *r1* with everything covered by *r2* removed."""
  return [piece
          for one_range in r1
          for piece in sub_one_range(one_range, r2)]
159
def add_ranges_in_dic(dic):
  """Return the sorted, merged union of every range list stored in *dic*."""
  combined = []
  for ranges in dic.values():
    combined.extend(ranges)

  return normalize_ranges(combined, True)
166
def normalize_ranges_in_dic(dic, sort=False):
  """Replace every range list in *dic* with its normalized form, in place."""
  for key in list(dic):
    dic[key] = normalize_ranges(dic[key], sort)
171
def merge_dic(to_dic, from_dic):
  """Copy every entry of *from_dic* into *to_dic*.

  Keys present in both are reported on stderr; the value from *from_dic*
  still overwrites the existing one.
  """
  collisions = sorted(set(to_dic) & set(from_dic))
  if collisions:
    sys.stderr.write("merge_dic: collision: %s\n" % (collisions,))

  to_dic.update(from_dic)
180
def merge_props(to_props, from_props):
  """Append *from_props* to the list *to_props* in place.

  Names present in both are reported on stderr but are appended anyway.
  """
  collisions = sorted(set(to_props) & set(from_props))
  if collisions:
    sys.stderr.write("merge_props: collision: %s\n" % (collisions,))

  to_props.extend(from_props)
187
def add_range_into_dic(dic, name, start, end):
  """Append (start, end) to the list stored under *name*, creating it if needed."""
  if dic.get(name) is None:
    dic[name] = []

  dic[name].append((start, end))
195
def list_sub(a, b):
  """Return the distinct items of *a* that are not in *b* (order unspecified)."""
  return list(set(a) - set(b))
199
200
def parse_unicode_data_file(f):
  """Collect code point ranges per category from ';'-separated records.

  *f* is an iterable of lines.  Field 0 is a hex code point, field 1 a
  description and field 2 the category name.  Returns (dic, assigned):
  *dic* maps each category name - and, for two-letter names, also its first
  letter - to a normalized range list; *assigned* lists every (start, end)
  seen, in file order.
  """
  categories = { }
  assigned = []
  for raw_line in f:
    record = raw_line.strip()
    if not record or record[0] == '#':
      continue

    fields = record.split(';')
    code = int(fields[0], 16)
    desc = fields[1]
    prop = fields[2]
    if UD_FIRST_REG.match(desc):
      # Opening half of a First/Last pair: remember where the range starts
      # and wait for the matching Last record before storing anything.
      start, end = code, None
      continue

    if UD_LAST_REG.match(desc):
      end = code
    else:
      start = end = code

    assigned.append((start, end))
    add_range_into_dic(categories, prop, start, end)
    if len(prop) == 2:
      add_range_into_dic(categories, prop[:1], start, end)

  normalize_ranges_in_dic(categories)
  return categories, assigned
231
def parse_properties(path, klass, prop_prefix = None, version_reg = None):
  """Parse a 'code[..code] ; Property' data file.

  path        -- file to read.
  klass       -- label stored in KDIC for every property found.
  prop_prefix -- optional string prepended to each property name.
  version_reg -- optional compiled regex tried against comment lines until
                 it matches once; the matching line is then skipped.

  Returns (dic, props, version_match): *dic* maps property name to its
  normalized ranges, *props* lists the property names in the order their
  "Total" trailer lines were seen, and *version_match* is the match object
  of *version_reg* (or None).
  """
  dic = { }
  props = []
  prop = None
  version_match = None
  with open(path, 'r') as f:
    for line in f:
      s = line.strip()
      if not s:
        continue

      if version_match is None and version_reg is not None and s[0] == '#':
        version_match = version_reg.match(s)
        if version_match is not None:
          continue

      m = PR_LINE_REG.match(s)
      if m is None:
        # A "Total" trailer closes the section of the most recent property.
        if PR_TOTAL_REG.match(s) is not None:
          KDIC[prop] = klass
          props.append(prop)
        continue

      first, last, name = m.groups()
      prop = name if prop_prefix is None else prop_prefix + name
      start = int(first, 16)
      end = int(last, 16) if last else start
      add_range_into_dic(dic, prop, start, end)

  normalize_ranges_in_dic(dic)
  return (dic, props, version_match)
268
def parse_property_aliases(path):
  """Read 'first ; second' lines from *path* into a {first: second} dict.

  Lines that do not match PA_LINE_REG, and lines whose two names are
  identical, are ignored.
  """
  aliases = { }
  with open(path, 'r') as f:
    for line in f:
      m = PA_LINE_REG.match(line.strip())
      if m is None:
        continue

      alias, target = m.groups()
      if alias != target:
        aliases[alias] = target

  return aliases
287
def parse_property_value_aliases(path):
  """Read the 'sc' and 'gc' lines of *path* into an alias -> name dict.

  For 'sc' lines the third field is the name every other field points to;
  for 'gc' lines it is the second field.  The optional fourth field is an
  extra alias for the same name.
  """
  aliases = { }
  with open(path, 'r') as f:
    for line in f:
      m = PVA_LINE_REG.match(line.strip())
      if m is None:
        continue

      cat, x2, x3, x4 = m.groups()
      if cat == 'sc':
        target, alias = x3, x2
      else:
        target, alias = x2, x3

      if alias != target:
        aliases[alias] = target
      if x4 and x4 != target:
        aliases[x4] = target

  return aliases
316
def parse_blocks(path):
  """Parse 'XXXX..YYYY; Block Title' lines from *path*.

  Returns (dic, blocks): *dic* maps each fixed-up block name to its ranges
  and *blocks* lists the names in file order.  A final 'In_No_Block' entry
  covers every code point that belongs to no listed block.
  """
  dic = { }
  blocks = []
  with open(path, 'r') as f:
    for line in f:
      m = BL_LINE_REG.match(line.strip())
      if m is None:
        continue

      first, last, title = m.groups()
      block = fix_block_name(title)
      add_range_into_dic(dic, block, int(first, 16), int(last, 16))
      blocks.append(block)

  noblock = fix_block_name('No_Block')
  dic[noblock] = inverse_ranges(add_ranges_in_dic(dic))
  blocks.append(noblock)
  return dic, blocks
340
def add_primitive_props(assigned):
  """Add the basic derived entries to the global DIC.

  Stores 'Assigned', 'Any', 'ASCII', 'NEWLINE', 'Cn' (complement of
  'Assigned') and 'LC' (union of 'Ll', 'Lt' and 'Lu'), and folds 'Cn'
  into the existing 'C' entry.
  """
  DIC['Assigned'] = normalize_ranges(assigned)
  DIC.update(
    Any     = [(0x000000, 0x10ffff)],
    ASCII   = [(0x000000, 0x00007f)],
    NEWLINE = [(0x00000a, 0x00000a)],
  )

  unassigned = inverse_ranges(DIC['Assigned'])
  DIC['Cn'] = unassigned
  DIC['C'] = normalize_ranges(DIC['C'] + unassigned, True)

  DIC['LC'] = normalize_ranges(DIC['Ll'] + DIC['Lt'] + DIC['Lu'], True)
355
def add_posix_props(dic):
  """Derive the POSIX bracket classes from properties already in *dic*.

  Reads the 'Alphabetic', 'Uppercase', 'Lowercase', 'White_Space', 'Any',
  'Nd', 'Zs', 'M', 'Pc', 'P', 'Cc', 'Cs' and 'Cn' entries and stores the
  POSIX names back into *dic* in place.
  """
  # Nd == Decimal_Number
  alnum = normalize_ranges(dic['Alphabetic'] + dic['Nd'], True)

  # TAB plus Zs (Space_Separator)
  blank = normalize_ranges([(0x0009, 0x0009)] + dic['Zs'], True)

  # M == Mark, Pc == Connector_Punctuation
  word = normalize_ranges(
    dic['Alphabetic'] + dic['M'] + dic['Nd'] + dic['Pc'], True)

  # Everything except white space, Cc, Cs (Surrogate) and Cn (Unassigned).
  graph = dic['Any']
  for excluded in ('White_Space', 'Cc', 'Cs', 'Cn'):
    graph = sub_ranges(graph, dic[excluded])
  graph = normalize_ranges(graph, True)

  printable = normalize_ranges(graph + dic['Zs'], True)

  dic.update({
    'Alpha':  dic['Alphabetic'],
    'Upper':  dic['Uppercase'],
    'Lower':  dic['Lowercase'],
    'Punct':  dic['P'],  # P == Punctuation
    'Digit':  dic['Nd'],
    'XDigit': [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)],
    'Alnum':  alnum,
    'Space':  dic['White_Space'],
    'Blank':  blank,
    'Cntrl':  dic['Cc'],
    'Word':   word,
    'Graph':  graph,
    'Print':  printable,
  })
397
398
def set_max_prop_name(name):
  """Raise the global PROPERTY_NAME_MAX_LEN to len(name) if that is larger."""
  global PROPERTY_NAME_MAX_LEN
  PROPERTY_NAME_MAX_LEN = max(PROPERTY_NAME_MAX_LEN, len(name))
404
def entry_prop_name(name, index):
  """Track the longest name seen and, in list mode, log non-POSIX entries.

  A line 'index: name' is written to the UPF file only when
  OUTPUT_LIST_MODE is set and *index* lies past the POSIX_LIST entries.
  """
  set_max_prop_name(name)
  if not OUTPUT_LIST_MODE or index < len(POSIX_LIST):
    return

  UPF.write("%3d: %s\n" % (index, name))
409
def entry_and_print_prop_and_index(name, index):
  """Register *name*, then print its normalized form with *index*."""
  entry_prop_name(name, index)
  print_prop_and_index(normalize_prop_name(name), index)
414
def parse_and_merge_properties(path, klass, prop_prefix = None, version_reg = None):
  """Run parse_properties and merge its result into the global DIC and PROPS.

  Returns the (dic, props, version_match) tuple from parse_properties.
  """
  parsed = parse_properties(path, klass, prop_prefix, version_reg)
  merge_dic(DIC, parsed[0])
  merge_props(PROPS, parsed[1])
  return parsed
420
421### main ###
argv = sys.argv
argc = len(argv)

POSIX_ONLY = False
INCLUDE_GRAPHEME_CLUSTER_DATA = False

# Recognised flags: -posix and -gc.  Anything else is reported on stderr
# and otherwise ignored.
for arg in argv[1:]:
  if arg == '-posix':
    POSIX_ONLY = True
  elif arg == '-gc':
    INCLUDE_GRAPHEME_CLUSTER_DATA = True
  else:
    sys.stderr.write("Invalid argument: %s\n" % arg)


# The UNICODE_PROPERTIES listing is written unless -posix was given.
OUTPUT_LIST_MODE = not POSIX_ONLY
439
# Load the per-code-point categories; the resulting dict becomes the global
# DIC that every later step extends.
with open('UnicodeData.txt', 'r') as f:
  dic, assigned = parse_unicode_data_file(f)
  DIC = dic
  add_primitive_props(assigned)

# PROPS starts as every key of DIC except the POSIX names.  list_sub goes
# through a set, so the order is unspecified until PROPS is sorted below.
PROPS = DIC.keys()
PROPS = list_sub(PROPS, POSIX_LIST)

# The Unicode version is taken from the first comment line of this file that
# matches UNICODE_VERSION_REG.
_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG)
if ver_m is not None:
  VERSION_INFO[0] = int(ver_m.group(1))
  VERSION_INFO[1] = int(ver_m.group(2))
  VERSION_INFO[2] = int(ver_m.group(3))

# 'Unknown' is the complement of all ranges found in Scripts.txt.
dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script')
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

parse_and_merge_properties('PropList.txt',   'Binary Property')

# The emoji version is taken from the first comment line matching
# EMOJI_VERSION_REG.
_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG)
if ver_m is not None:
  EMOJI_VERSION_INFO[0] = int(ver_m.group(1))
  EMOJI_VERSION_INFO[1] = int(ver_m.group(2))

# 'Unknown' was stored in DIC directly above, so it is registered by hand.
PROPS.append('Unknown')
KDIC['Unknown'] = 'Script'

# Property aliases and property-value aliases share one lookup table.
ALIASES = parse_property_aliases('PropertyAliases.txt')
a = parse_property_value_aliases('PropertyValueAliases.txt')
merge_dic(ALIASES, a)

# Blocks keep their own ordered list (BLOCKS) instead of joining PROPS.
dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)

# Optional (-gc): add the grapheme-break properties under a name prefix.
if INCLUDE_GRAPHEME_CLUSTER_DATA:
  dic, props, _ = parse_properties('GraphemeBreakProperty.txt',
                                   'GraphemeBreak Property',
                                   GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
  merge_dic(DIC, dic)
  merge_props(PROPS, props)
  #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other'
  #DIC[prop] = inverse_ranges(add_ranges_in_dic(dic))
  #PROPS.append(prop)
  #KDIC[prop] = 'GrapemeBreak Property'

# The POSIX classes are derived last because they read properties merged
# above (e.g. 'Alphabetic', 'White_Space').
add_posix_props(DIC)
PROPS = sorted(PROPS)
487
# Open the "%{ ... %}" section and emit one code-range table per property.
s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
print(s)
for prop in POSIX_LIST:
  print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

print('')

if not POSIX_ONLY:
  for prop in PROPS:
    klass = KDIC.get(prop)
    if klass is None:
      # No class was recorded while parsing: classify by name length.
      klass = {1: 'Major Category', 2: 'General Category'}.get(len(prop), '-')

    print_property(prop, DIC[prop], klass)

  for block in BLOCKS:
    print_property(block, DIC[block], 'Block')


# CodeRanges lists the tables in index order: POSIX, then PROPS, then BLOCKS.
print('')
print("static const OnigCodePoint*\nconst CodeRanges[] = {")

table_names = list(POSIX_LIST)
if not POSIX_ONLY:
  table_names += PROPS
  table_names += BLOCKS

for prop in table_names:
  print("  CR_%s," % prop)

s = '''};

#define pool_offset(s) offsetof(struct unicode_prop_name_pool_t, unicode_prop_name_pool_str##s)

%}
struct PoolPropertyNameCtype {
  short int name;
  short int ctype;
};

%%
'''
sys.stdout.write(s)
541
if OUTPUT_LIST_MODE:
  UPF = open("UNICODE_PROPERTIES", "w")
  if VERSION_INFO[0] < 0:
    raise RuntimeError("Unicode Version is not found")
  if EMOJI_VERSION_INFO[0] < 0:
    raise RuntimeError("Emoji Version is not found")

  UPF.write("Unicode Properties (Unicode Version: %d.%d.%d,  Emoji: %d.%d)\n"
            % (tuple(VERSION_INFO) + tuple(EMOJI_VERSION_INFO)))
  UPF.write("\n")

# Indexes are handed out consecutively: POSIX names, then PROPS, then BLOCKS.
index = -1
for index, prop in enumerate(POSIX_LIST, index + 1):
  entry_and_print_prop_and_index(prop, index)

if not POSIX_ONLY:
  for index, prop in enumerate(PROPS, index + 1):
    entry_and_print_prop_and_index(prop, index)

  # Aliases reuse the index of the property they point to.
  NALIASES = sorted((normalize_prop_name(k), k, v) for k, v in ALIASES.items())
  for (nk, k, v) in NALIASES:
    if PropIndex.get(nk) is not None:
      sys.stderr.write("ALIASES: already exists: %s => %s\n" % (k, v))
      continue

    aindex = PropIndex.get(normalize_prop_name(v))
    if aindex is None:
      # The alias target has no table; skipped without a message.
      continue

    entry_prop_name(k, aindex)
    print_prop_and_index(nk, aindex)

  for index, name in enumerate(BLOCKS, index + 1):
    entry_and_print_prop_and_index(name, index)
580
# Close the name section and emit the trailing #define lines.
print('%%')
print('')
if not POSIX_ONLY:
  if VERSION_INFO[0] < 0:
    raise RuntimeError("Unicode Version is not found")
  if EMOJI_VERSION_INFO[0] < 0:
    raise RuntimeError("Emoji Version is not found")

  print("#define UNICODE_PROPERTY_VERSION  %02d%02d%02d" % tuple(VERSION_INFO))
  print("#define UNICODE_EMOJI_VERSION     %02d%02d" % tuple(EMOJI_VERSION_INFO))
  print('')

print("#define PROPERTY_NAME_MAX_SIZE  %d" % (PROPERTY_NAME_MAX_LEN + 10))
print("#define CODE_RANGES_NUM         %d" % (index + 1))

# One PROP_INDEX_* define per registered name, grouped by table index.
index_props = make_reverse_dic(PropIndex)
print('')
for i in range(index + 1):
  for p in index_props[i]:
    print("#define PROP_INDEX_%s %d" % (p.upper(), i))

if OUTPUT_LIST_MODE:
  UPF.close()

sys.exit(0)
606