#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_property_data.py
# Copyright (c) 2016-2020  K.Kosako

import sys
import re

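# POSIX bracket expression names. These are always emitted first, so they
# occupy the lowest indices in the generated CodeRanges[] table.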
POSIX_LIST = [
    'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
    'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
]

MAX_CODE_POINT = 0x10ffff

GRAPHEME_CLUSTER_BREAK_NAME_PREFIX = 'Grapheme_Cluster_Break_'

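# Regular expressions used to parse the UCD data files:
#   UD_*  - UnicodeData.txt (<..., First> / <..., Last> range markers)
#   PR_*  - property files such as PropList.txt, Scripts.txt, emoji-data.txt
#   PA_*  - PropertyAliases.txt
#   PVA_* - PropertyValueAliases.txt
#   BL_*  - Blocks.txt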
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG  = re.compile(r"<.+,\s*Last>")
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
UNICODE_VERSION_REG = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")
EMOJI_VERSION_REG   = re.compile(r"(?i)#\s*Version:\s*(\d+)\.(\d+)")

VERSION_INFO = [-1, -1, -1]
EMOJI_VERSION_INFO = [-1, -1]

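# Global state shared across the parsing and output stages:
#   DIC       - property name -> list of (start, end) code point ranges
#   KDIC      - property name -> property class shown in the output comments
#   PropIndex - normalized property name -> index into CodeRanges[]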
DIC  = { }
KDIC = { }
PropIndex = { }
PROPERTY_NAME_MAX_LEN = 0
PROPS = None

def normalize_prop_name(name):
  name = re.sub(r'[ _]', '', name)
  name = name.lower()
  return name

def fix_block_name(name):
  s = re.sub(r'[- ]+', '_', name)
  return 'In_' + s

def print_ranges(ranges):
  for (start, end) in ranges:
    print "0x%06x, 0x%06x" % (start, end)

  print len(ranges)

def print_prop_and_index(prop, i):
  print "%-35s %3d" % (prop + ',', i)
  PropIndex[prop] = i

PRINT_CACHE = { }

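# Emit one CR_<name>[] code range table. If identical range data has already
# been printed for another property, emit a '#define CR_<name> CR_<other>'
# alias instead of duplicating the table.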
def print_property(prop, data, desc):
  print ''
  print "/* PROPERTY: '%s': %s */" % (prop, desc)

  prev_prop = dic_find_by_value(PRINT_CACHE, data)
  if prev_prop is not None:
    print "#define CR_%s CR_%s" % (prop, prev_prop)
  else:
    PRINT_CACHE[prop] = data
    print "static const OnigCodePoint"
    print "CR_%s[] = { %d," % (prop, len(data))
    for (start, end) in data:
      print "0x%04x, 0x%04x," % (start, end)

    print "}; /* END of CR_%s */" % prop


def dic_find_by_value(dic, v):
  for key, val in dic.items():
    if val == v:
      return key

  return None

def make_reverse_dic(dic):
  rev = {}
  for key, val in dic.items():
    d = rev.get(val, None)
    if d is None:
      rev[val] = [key]
    else:
      d.append(key)

  return rev

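# Merge adjacent or overlapping ranges into a minimal sorted range list.
# The input must already be sorted unless sort=True is given.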
def normalize_ranges(in_ranges, sort=False):
  if sort:
    ranges = sorted(in_ranges)
  else:
    ranges = in_ranges

  r = []
  prev = None
  for (start, end) in ranges:
    if prev is not None and prev >= start - 1:
      (pstart, pend) = r.pop()
      end = max(pend, end)
      start = pstart

    r.append((start, end))
    prev = end

  return r

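# Return the complement of a normalized range list within 0..MAX_CODE_POINT.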
def inverse_ranges(in_ranges):
  r = []
  prev = 0x000000
  for (start, end) in in_ranges:
    if prev < start:
      r.append((prev, start - 1))

    prev = end + 1

  if prev < MAX_CODE_POINT:
    r.append((prev, MAX_CODE_POINT))

  return r

def add_ranges(r1, r2):
  r = r1 + r2
  return normalize_ranges(r, True)

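# Range subtraction: remove every range in rs from one_range; sub_ranges()
# applies this to each range of r1, yielding r1 minus r2.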
def sub_one_range(one_range, rs):
  r = []
  (s1, e1) = one_range
  n = len(rs)
  for i in range(0, n):
    (s2, e2) = rs[i]
    if s2 >= s1 and s2 <= e1:
      if s2 > s1:
        r.append((s1, s2 - 1))
      if e2 >= e1:
        return r

      s1 = e2 + 1
    elif s2 < s1 and e2 >= s1:
      if e2 < e1:
        s1 = e2 + 1
      else:
        return r

  r.append((s1, e1))
  return r

def sub_ranges(r1, r2):
  r = []
  for one_range in r1:
    rs = sub_one_range(one_range, r2)
    r.extend(rs)

  return r

def add_ranges_in_dic(dic):
  r = []
  for k, v in dic.items():
    r = r + v

  return normalize_ranges(r, True)

def normalize_ranges_in_dic(dic, sort=False):
  for k, v in dic.items():
    r = normalize_ranges(v, sort)
    dic[k] = r

def merge_dic(to_dic, from_dic):
  to_keys   = to_dic.keys()
  from_keys = from_dic.keys()
  common = list(set(to_keys) & set(from_keys))
  if len(common) != 0:
    print >> sys.stderr, "merge_dic: collision: %s" % sorted(common)

  to_dic.update(from_dic)

def merge_props(to_props, from_props):
  common = list(set(to_props) & set(from_props))
  if len(common) != 0:
    print >> sys.stderr, "merge_props: collision: %s" % sorted(common)

  to_props.extend(from_props)

def add_range_into_dic(dic, name, start, end):
  d = dic.get(name, None)
  if d is None:
    d = [(start, end)]
    dic[name] = d
  else:
    d.append((start, end))

def list_sub(a, b):
  x = set(a) - set(b)
  return list(x)


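# Parse UnicodeData.txt. Each line is 'code;name;general_category;...'.
# '<..., First>' / '<..., Last>' name pairs are folded into a single range.
# Every code point is recorded under its two-letter general category and
# also under the one-letter major category (e.g. Lu is added to L as well).
# Returns the category dictionary and the list of assigned ranges.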
def parse_unicode_data_file(f):
  dic = { }
  assigned = []
  for line in f:
    s = line.strip()
    if len(s) == 0:
      continue
    if s[0] == '#':
      continue

    a = s.split(';')
    code = int(a[0], 16)
    desc = a[1]
    prop = a[2]
    if UD_FIRST_REG.match(desc) is not None:
      start = code
      end   = None
    elif UD_LAST_REG.match(desc) is not None:
      end = code
    else:
      start = end = code

    if end is not None:
      assigned.append((start, end))
      add_range_into_dic(dic, prop, start, end)
      if len(prop) == 2:
        add_range_into_dic(dic, prop[0:1], start, end)

  normalize_ranges_in_dic(dic)
  return dic, assigned

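# Parse a PropList.txt-style file: '<code>; <prop>' or '<start>..<end>; <prop>'.
# A '# Total code points:' (or '# Total elements:') comment closes one
# property block, at which point the property is registered in KDIC with the
# given class. Optionally captures the file's version from its header comment.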
def parse_properties(path, klass, prop_prefix = None, version_reg = None):
  version_match = None
  with open(path, 'r') as f:
    dic = { }
    prop = None
    props = []
    for line in f:
      s = line.strip()
      if len(s) == 0:
        continue

      if s[0] == '#' and version_reg is not None and version_match is None:
        version_match = version_reg.match(s)
        if version_match is not None:
          continue

      m = PR_LINE_REG.match(s)
      if m:
        prop = m.group(3)
        if prop_prefix is not None:
          prop = prop_prefix + prop

        if m.group(2):
          start = int(m.group(1), 16)
          end   = int(m.group(2), 16)
          add_range_into_dic(dic, prop, start, end)
        else:
          start = int(m.group(1), 16)
          add_range_into_dic(dic, prop, start, start)

      elif PR_TOTAL_REG.match(s) is not None:
        KDIC[prop] = klass
        props.append(prop)

  normalize_ranges_in_dic(dic)
  return (dic, props, version_match)

def parse_property_aliases(path):
  a = { }
  with open(path, 'r') as f:
    for line in f:
      s = line.strip()
      if len(s) == 0:
        continue

      m = PA_LINE_REG.match(s)
      if not(m):
        continue

      if m.group(1) == m.group(2):
        continue

      a[m.group(1)] = m.group(2)

  return a

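# Parse PropertyValueAliases.txt. Script (sc) aliases are mapped to the long
# value name used in Scripts.txt; General_Category (gc) aliases are mapped to
# the short name (Lu, Nd, ...) under which UnicodeData.txt entries are stored.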
def parse_property_value_aliases(path):
  a = { }
  with open(path, 'r') as f:
    for line in f:
      s = line.strip()
      if len(s) == 0:
        continue

      m = PVA_LINE_REG.match(s)
      if not(m):
        continue

      cat = m.group(1)
      x2  = m.group(2)
      x3  = m.group(3)
      x4  = m.group(4)
      if cat == 'sc':
        if x2 != x3:
          a[x2] = x3
        if x4 and x4 != x3:
          a[x4] = x3
      else:
        if x2 != x3:
          a[x3] = x2
        if x4 and x4 != x2:
          a[x4] = x2

  return a

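# Parse Blocks.txt. Each block becomes an 'In_<Block_Name>' property; an
# In_No_Block property is synthesized for code points not covered by any block.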
def parse_blocks(path):
  dic = { }
  blocks = []
  with open(path, 'r') as f:
    for line in f:
      s = line.strip()
      if len(s) == 0:
        continue

      m = BL_LINE_REG.match(s)
      if not(m):
        continue

      start = int(m.group(1), 16)
      end   = int(m.group(2), 16)
      block = fix_block_name(m.group(3))
      add_range_into_dic(dic, block, start, end)
      blocks.append(block)

  noblock = fix_block_name('No_Block')
  dic[noblock] = inverse_ranges(add_ranges_in_dic(dic))
  blocks.append(noblock)
  return dic, blocks

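# Derive properties that are not listed directly in the data files:
# Assigned, Any, ASCII, NEWLINE, Cn (complement of Assigned), C including Cn,
# and LC (cased letters: Ll + Lt + Lu).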
def add_primitive_props(assigned):
  DIC['Assigned'] = normalize_ranges(assigned)
  DIC['Any']     = [(0x000000, 0x10ffff)]
  DIC['ASCII']   = [(0x000000, 0x00007f)]
  DIC['NEWLINE'] = [(0x00000a, 0x00000a)]
  DIC['Cn'] = inverse_ranges(DIC['Assigned'])
  DIC['C'].extend(DIC['Cn'])
  DIC['C'] = normalize_ranges(DIC['C'], True)

  d = []
  d.extend(DIC['Ll'])
  d.extend(DIC['Lt'])
  d.extend(DIC['Lu'])
  DIC['LC'] = normalize_ranges(d, True)

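# Build the POSIX bracket classes from the Unicode properties parsed above:
# Alnum = Alphabetic + Nd, Word = Alphabetic + M + Nd + Pc,
# Graph = Any - White_Space - Cc - Cs - Cn, Print = Graph + Zs, etc.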
def add_posix_props(dic):
  alnum = []
  alnum.extend(dic['Alphabetic'])
  alnum.extend(dic['Nd'])  # Nd == Decimal_Number
  alnum = normalize_ranges(alnum, True)

  blank = [(0x0009, 0x0009)]
  blank.extend(dic['Zs'])  # Zs == Space_Separator
  blank = normalize_ranges(blank, True)

  word = []
  word.extend(dic['Alphabetic'])
  word.extend(dic['M'])   # M == Mark
  word.extend(dic['Nd'])
  word.extend(dic['Pc'])  # Pc == Connector_Punctuation
  word = normalize_ranges(word, True)

  graph = sub_ranges(dic['Any'], dic['White_Space'])
  graph = sub_ranges(graph, dic['Cc'])
  graph = sub_ranges(graph, dic['Cs'])  # Cs == Surrogate
  graph = sub_ranges(graph, dic['Cn'])  # Cn == Unassigned
  graph = normalize_ranges(graph, True)

  p = []
  p.extend(graph)
  p.extend(dic['Zs'])
  p = normalize_ranges(p, True)

  dic['Alpha']  = dic['Alphabetic']
  dic['Upper']  = dic['Uppercase']
  dic['Lower']  = dic['Lowercase']
  dic['Punct']  = dic['P']  # P == Punctuation
  dic['Digit']  = dic['Nd']
  dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
  dic['Alnum']  = alnum
  dic['Space']  = dic['White_Space']
  dic['Blank']  = blank
  dic['Cntrl']  = dic['Cc']
  dic['Word']   = word
  dic['Graph']  = graph
  dic['Print']  = p


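# Track the longest property name (used for PROPERTY_NAME_MAX_SIZE) and, in
# list mode, write each non-POSIX property name to the UNICODE_PROPERTIES file.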
def set_max_prop_name(name):
  global PROPERTY_NAME_MAX_LEN
  n = len(name)
  if n > PROPERTY_NAME_MAX_LEN:
    PROPERTY_NAME_MAX_LEN = n

def entry_prop_name(name, index):
  set_max_prop_name(name)
  if OUTPUT_LIST_MODE and index >= len(POSIX_LIST):
    print >> UPF, "%s" % (name)

def entry_and_print_prop_and_index(name, index):
  entry_prop_name(name, index)
  nname = normalize_prop_name(name)
  print_prop_and_index(nname, index)

def parse_and_merge_properties(path, klass, prop_prefix = None, version_reg = None):
  dic, props, ver_m = parse_properties(path, klass, prop_prefix, version_reg)
  merge_dic(DIC, dic)
  merge_props(PROPS, props)
  return dic, props, ver_m


### main ###
argv = sys.argv
argc = len(argv)

COPYRIGHT = '''
/*-
 * Copyright (c) 2016-2020  K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()

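# Command line options:
#   -posix : output only the POSIX bracket properties
#   -gc    : also include the Grapheme_Cluster_Break_* properties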
POSIX_ONLY = False
INCLUDE_GRAPHEME_CLUSTER_DATA = False

for i in range(1, argc):
  arg = argv[i]
  if arg == '-posix':
    POSIX_ONLY = True
  elif arg == '-gc':
    INCLUDE_GRAPHEME_CLUSTER_DATA = True
  else:
    print >> sys.stderr, "Invalid argument: %s" % arg


OUTPUT_LIST_MODE = not(POSIX_ONLY)

with open('UnicodeData.txt', 'r') as f:
  dic, assigned = parse_unicode_data_file(f)
  DIC = dic
  add_primitive_props(assigned)

PROPS = DIC.keys()
PROPS = list_sub(PROPS, POSIX_LIST)

_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG)
if ver_m is not None:
  VERSION_INFO[0] = int(ver_m.group(1))
  VERSION_INFO[1] = int(ver_m.group(2))
  VERSION_INFO[2] = int(ver_m.group(3))

dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script')
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

parse_and_merge_properties('PropList.txt',   'Binary Property')

_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG)
if ver_m is not None:
  EMOJI_VERSION_INFO[0] = int(ver_m.group(1))
  EMOJI_VERSION_INFO[1] = int(ver_m.group(2))

PROPS.append('Unknown')
KDIC['Unknown'] = 'Script'

ALIASES = parse_property_aliases('PropertyAliases.txt')
a = parse_property_value_aliases('PropertyValueAliases.txt')
merge_dic(ALIASES, a)

dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)

if INCLUDE_GRAPHEME_CLUSTER_DATA:
  dic, props, _ = parse_properties('GraphemeBreakProperty.txt',
                                   'GraphemeBreak Property',
                                   GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
  merge_dic(DIC, dic)
  merge_props(PROPS, props)
  #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other'
  #DIC[prop] = inverse_ranges(add_ranges_in_dic(dic))
  #PROPS.append(prop)
  #KDIC[prop] = 'GraphemeBreak Property'

add_posix_props(DIC)
PROPS = sorted(PROPS)


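# From here on the script writes the generated source to stdout. The layout is
# gperf input: the C code range tables go inside the %{ ... %} section, and the
# 'name, index' keyword lines follow the first %%.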
s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
print s
print COPYRIGHT
print ''

for prop in POSIX_LIST:
  print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

print ''

if not(POSIX_ONLY):
  for prop in PROPS:
    klass = KDIC.get(prop, None)
    if klass is None:
      n = len(prop)
      if n == 1:
        klass = 'Major Category'
      elif n == 2:
        klass = 'General Category'
      else:
        klass = '-'

    print_property(prop, DIC[prop], klass)

  for block in BLOCKS:
    print_property(block, DIC[block], 'Block')


print ''
print "static const OnigCodePoint*\nconst CodeRanges[] = {"

for prop in POSIX_LIST:
  print "  CR_%s," % prop

if not(POSIX_ONLY):
  for prop in PROPS:
    print "  CR_%s," % prop

  for prop in BLOCKS:
    print "  CR_%s," % prop

s = '''};

#define pool_offset(s) offsetof(struct unicode_prop_name_pool_t, unicode_prop_name_pool_str##s)

%}
struct PoolPropertyNameCtype {
  short int name;
  short int ctype;
};

%%
'''
sys.stdout.write(s)

if OUTPUT_LIST_MODE:
  UPF = open("UNICODE_PROPERTIES", "w")
  if VERSION_INFO[0] < 0:
    raise RuntimeError("Unicode Version is not found")
  if EMOJI_VERSION_INFO[0] < 0:
    raise RuntimeError("Emoji Version is not found")

  print >> UPF, "Unicode Properties (Unicode Version: %d.%d.%d,  Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1])
  print >> UPF, ''

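# Emit one 'normalized-name, index' keyword line per property. Aliases reuse
# the index of the property they resolve to; an alias whose name is already
# registered is skipped with a warning.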
index = -1
for prop in POSIX_LIST:
  index += 1
  entry_and_print_prop_and_index(prop, index)

if not(POSIX_ONLY):
  for prop in PROPS:
    index += 1
    entry_and_print_prop_and_index(prop, index)

  NALIASES = map(lambda (k,v):(normalize_prop_name(k), k, v), ALIASES.items())
  NALIASES = sorted(NALIASES)
  for (nk, k, v) in NALIASES:
    nv = normalize_prop_name(v)
    if PropIndex.get(nk, None) is not None:
      print >> sys.stderr, "ALIASES: already exists: %s => %s" % (k, v)
      continue
    aindex = PropIndex.get(nv, None)
    if aindex is None:
      #print >> sys.stderr, "ALIASES: value does not exist: %s => %s" % (k, v)
      continue

    entry_prop_name(k, aindex)
    print_prop_and_index(nk, aindex)

  for name in BLOCKS:
    index += 1
    entry_and_print_prop_and_index(name, index)

print '%%'
print ''
if not(POSIX_ONLY):
  if VERSION_INFO[0] < 0:
    raise RuntimeError("Unicode Version is not found")
  if EMOJI_VERSION_INFO[0] < 0:
    raise RuntimeError("Emoji Version is not found")

  print "#define UNICODE_PROPERTY_VERSION  %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2])
  print "#define UNICODE_EMOJI_VERSION     %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1])
  print ''

print "#define PROPERTY_NAME_MAX_SIZE  %d" % (PROPERTY_NAME_MAX_LEN + 10)
print "#define CODE_RANGES_NUM         %d" % (index + 1)

index_props = make_reverse_dic(PropIndex)
print ''
for i in range(index + 1):
  for p in index_props[i]:
    print "#define PROP_INDEX_%s %d" % (p.upper(), i)

if OUTPUT_LIST_MODE:
  UPF.close()

sys.exit(0)