1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# make_unicode_wb_data.py
4# Copyright (c) 2019  K.Kosako
5
6import sys
7import re
8
# Highest code point handled by this script; inverse_ranges() uses it as the
# upper bound of the code space.
MAX_CODE_POINT = 0x10ffff

# All patterns are raw strings so that backslash sequences (\s, \d, \w, \.)
# are passed to the regex engine verbatim instead of relying on Python
# leaving unknown string escapes untouched.

# Matches a "# Total code points: N" / "# Total elements: N" summary comment.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# Matches "XXXX ; Name" or "XXXX..YYYY ; Name" (hex code point or range).
# The range separator is a literal ".." -- the dots must be escaped, otherwise
# any two characters would be accepted between the two hex numbers.
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
# The next three patterns are not referenced in the visible part of this
# script; they are kept so the module-level names stay available.
# Matches "word ; word".
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
# Matches "sc ; a ; b" or "gc ; a ; b", with an optional fourth "; c" field.
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
# Matches "XXXX..YYYY ; rest-of-line".
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# Matches a comment of the form "# <anything>-<n>.<n>.<n>.txt" and captures
# the three numbers.
VERSION_REG  = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")

# Three numeric version components filled in by check_version_info();
# a negative first element means no version line has been found yet.
VERSION_INFO = [-1, -1, -1]
# Property name -> list of (start, end) code point ranges.
DIC  = { }
# Property names collected from the parsed data file.
PROPS = []
# Property name -> index, recorded by print_prop_and_index().
PropIndex = { }
22
def check_version_info(s):
  """Store the version numbers found in comment line `s` into VERSION_INFO.

  If `s` matches VERSION_REG, the three captured numbers are converted to
  int and written, in order, into the first three slots of VERSION_INFO.
  When the line does not match, VERSION_INFO is left untouched.
  """
  found = VERSION_REG.match(s)
  if found is None:
    return

  # Update the shared list in place so every holder of VERSION_INFO sees it.
  VERSION_INFO[0:3] = [int(found.group(n)) for n in (1, 2, 3)]
29
def print_ranges(ranges):
  """Print each (start, end) pair in `ranges` as "0xSSSSSS, 0xEEEEEE".

  One pair is written per line to standard output, each value formatted as
  zero-padded six-digit lowercase hex.
  """
  for (start, end) in ranges:
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print() function.
    print("0x%06x, 0x%06x" % (start, end))
33
def print_prop_and_index(prop, i):
  """Print "<prop>," padded to 35 columns followed by index `i`, and record it.

  Besides writing the line to standard output, the mapping prop -> i is
  stored in the module-level PropIndex dictionary.
  """
  # Parenthesised single-argument form: valid as both the Python 2 print
  # statement and the Python 3 print() function, with identical output.
  print("%-35s %3d" % (prop + ',', i))
  PropIndex[prop] = i
37
def dic_find_by_value(dic, v):
  """Return the first key of `dic` whose value equals `v`, or None.

  Keys are examined in the dictionary's own iteration order; the search
  stops at the first value that compares equal to `v`.
  """
  matching_keys = (key for key, val in dic.items() if val == v)
  return next(matching_keys, None)
44
45
def normalize_ranges(in_ranges, sort=False):
  """Merge overlapping or adjacent (start, end) ranges into a new list.

  in_ranges: iterable of (start, end) integer pairs with inclusive bounds.
  sort:      when true, a sorted copy of `in_ranges` is processed; otherwise
             the ranges are processed in the order given.

  A range is folded into the previously emitted one when that one ends at or
  after `start - 1` (i.e. they overlap or touch).  The input is not modified.
  Returns the list of merged (start, end) tuples.
  """
  if sort:
    ranges = sorted(in_ranges)
  else:
    ranges = in_ranges

  r = []
  for (start, end) in ranges:
    # Test for "no previous range" explicitly instead of comparing None with
    # an int: that comparison only works through Python 2's arbitrary
    # cross-type ordering and raises TypeError on Python 3.
    if r and r[-1][1] >= start - 1:
      (pstart, pend) = r.pop()
      end = max(pend, end)
      start = pstart

    r.append((start, end))

  return r
64
def inverse_ranges(in_ranges, max_code_point=None):
  """Return the gaps not covered by `in_ranges` within [0, max_code_point].

  in_ranges:      (start, end) integer pairs with inclusive bounds; the loop
                  assumes they are in ascending order and non-overlapping
                  (as produced by normalize_ranges).
  max_code_point: inclusive upper bound of the code space; defaults to the
                  module-level MAX_CODE_POINT when omitted.

  Returns a list of (start, end) tuples covering every value in the code
  space that is outside all input ranges.
  """
  if max_code_point is None:
    max_code_point = MAX_CODE_POINT

  r = []
  prev = 0x000000
  for (start, end) in in_ranges:
    if prev < start:
      r.append((prev, start - 1))

    prev = end + 1

  # `<=` rather than `<`: when the last input range ends one below the upper
  # bound, the remaining gap is the single value `max_code_point` itself and
  # must still be reported.
  if prev <= max_code_point:
    r.append((prev, max_code_point))

  return r
78
def add_ranges(r1, r2):
  """Return the union of range lists `r1` and `r2` as a merged range list.

  The two lists are concatenated and handed to normalize_ranges with sorting
  enabled, so the inputs need not be ordered relative to each other.
  """
  return normalize_ranges(r1 + r2, True)
82
def sub_one_range(one_range, rs):
  """Subtract every range in `rs` from the single range `one_range`.

  one_range: (start, end) pair with inclusive bounds.
  rs:        list of (start, end) pairs, visited in the order given.

  Returns the list of (start, end) pieces of `one_range` left over after the
  ranges in `rs` have been cut out, in visiting order.
  """
  leftover = []
  lo, hi = one_range
  for cut_lo, cut_hi in rs:
    if lo <= cut_lo <= hi:
      # The cut begins inside what is left: keep the part in front of it.
      if cut_lo > lo:
        leftover.append((lo, cut_lo - 1))
      if cut_hi >= hi:
        return leftover
      lo = cut_hi + 1
    elif cut_lo < lo <= cut_hi:
      # The cut begins before what is left but reaches into it.
      if cut_hi >= hi:
        return leftover
      lo = cut_hi + 1

  leftover.append((lo, hi))
  return leftover
104
def sub_ranges(r1, r2):
  """Return the pieces of the ranges in `r1` that remain after removing `r2`.

  Each range of `r1` is passed to sub_one_range together with `r2`, and the
  resulting pieces are concatenated in the order of `r1`.
  """
  remaining = []
  for piece in r1:
    remaining += sub_one_range(piece, r2)

  return remaining
112
def add_ranges_in_dic(dic):
  """Return one merged range list built from every value of `dic`.

  The values (lists of (start, end) pairs) are concatenated in the
  dictionary's iteration order and then sorted and merged by
  normalize_ranges.
  """
  combined = []
  for ranges_of_key in dic.values():
    combined = combined + ranges_of_key

  return normalize_ranges(combined, True)
119
def normalize_ranges_in_dic(dic, sort=False):
  """Replace every value of `dic` with its normalize_ranges() result.

  `sort` is forwarded unchanged to normalize_ranges.  The dictionary is
  updated in place and nothing is returned.
  """
  # Iterate over a snapshot of the keys while the values are reassigned.
  for name in list(dic.keys()):
    dic[name] = normalize_ranges(dic[name], sort)
124
def merge_dic(to_dic, from_dic):
  """Copy every entry of `from_dic` into `to_dic`, warning about collisions.

  Keys present in both dictionaries are reported on standard error as a
  sorted list; the merge still proceeds and the value from `from_dic` wins.
  `to_dic` is modified in place and nothing is returned.
  """
  common = set(to_dic.keys()) & set(from_dic.keys())
  if common:
    # sys.stderr.write works on Python 2 and 3 alike; the Python 2 form
    # `print >> sys.stderr, ...` parses as an expression on Python 3 and
    # raises TypeError exactly when a collision needs to be reported.
    message = "merge_dic: collision: %s" % sorted(common)
    sys.stderr.write(message + "\n")

  to_dic.update(from_dic)
133
def merge_props(to_props, from_props):
  """Append every item of `from_props` to `to_props`, warning about duplicates.

  Names present in both lists are reported on standard error as a sorted
  list; the items are appended regardless, so duplicates are kept.
  `to_props` is modified in place and nothing is returned.
  """
  common = set(to_props) & set(from_props)
  if common:
    # sys.stderr.write works on Python 2 and 3 alike; the Python 2 form
    # `print >> sys.stderr, ...` parses as an expression on Python 3 and
    # raises TypeError exactly when a collision needs to be reported.
    message = "merge_props: collision: %s" % sorted(common)
    sys.stderr.write(message + "\n")

  to_props.extend(from_props)
140
def add_range_into_dic(dic, name, start, end):
  """Append the range (start, end) to the list stored under `name` in `dic`.

  When `name` is missing from `dic` (or mapped to None) a new one-element
  list is stored; otherwise the existing list is extended in place.
  """
  existing = dic.get(name)
  if existing is not None:
    existing.append((start, end))
    return

  dic[name] = [(start, end)]
148
def list_sub(a, b):
  """Return a list of the distinct items of `a` that do not occur in `b`.

  Both arguments are converted to sets first, so duplicates are dropped and
  the order of the result is whatever the resulting set yields.
  """
  return list(set(a).difference(set(b)))
152
def parse_properties(path):
  """Parse a property data file into ranges grouped by property name.

  path: name of the text file to read.

  Every non-blank line is stripped and examined:
    * while no version has been recorded in VERSION_INFO, lines starting
      with '#' are passed to check_version_info;
    * lines matching PR_LINE_REG contribute a (start, end) range (a single
      code point becomes a one-element range) under the property name;
    * lines matching PR_TOTAL_REG append the most recently seen property
      name to the list of properties.

  Returns (dic, props): `dic` maps property name -> list of ranges after
  normalize_ranges_in_dic has been applied, `props` is the list of names.
  """
  dic = { }
  props = []
  prop = None

  with open(path, 'r') as f:
    for line in f:
      s = line.strip()
      if not s:
        continue

      if s.startswith('#') and VERSION_INFO[0] < 0:
        check_version_info(s)

      m = PR_LINE_REG.match(s)
      if m is None:
        # Not a data line; a "Total ..." comment closes the current property.
        if PR_TOTAL_REG.match(s) is not None:
          props.append(prop)
        continue

      prop = m.group(3)
      first = int(m.group(1), 16)
      # Group 2 is only present for "XXXX..YYYY" lines.
      last = int(m.group(2), 16) if m.group(2) else first
      add_range_into_dic(dic, prop, first, last)

  normalize_ranges_in_dic(dic)
  return (dic, props)
183
184
185### main ###
argv = sys.argv
argc = len(argv)

# Read the word-break property table and fold its contents into the
# module-level DIC / PROPS collections.
dic, props = parse_properties('WordBreakProperty.txt')
merge_dic(DIC, dic)
merge_props(PROPS, props)

# Property names are emitted in sorted order from here on.
PROPS.sort()
194
# Every print below uses the parenthesised single-argument form, which
# produces identical output under the Python 2 print statement and the
# Python 3 print() function.

# First line of the generated C file.
print('/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */')
# License header copied verbatim into the generated file.
COPYRIGHT = '''
/*-
 * Copyright (c) 2019  K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()

print(COPYRIGHT)
print('')
# A negative first component means parse_properties never saw a version line.
if VERSION_INFO[0] < 0:
  raise RuntimeError("Version is not found.")

# The three version components are emitted as one zero-padded number.
print("#define WORD_BREAK_PROPERTY_VERSION  %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]))
print('')
231
# Flatten the per-property range lists into (start, end, prop) triples.
ranges = []
for prop in PROPS:
  rs = DIC[prop]
  for (start, end) in rs:
    ranges.append((start, end, prop))

# Order the triples by their start code point.
ranges = sorted(ranges, key=lambda x: x[0])

# Verify that no range starts at or before the end of the preceding one.
prev = -1
for (start, end, prop) in ranges:
  if prev >= start:
    raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))

  # Remember where this range ends; without this update `prev` would stay at
  # -1 and the overlap check above could never trigger.
  prev = end
244
245
# Every print below uses the parenthesised single-argument form, which
# produces identical output under the Python 2 print statement and the
# Python 3 print() function.

# List the property names inside a C comment.
print('/*')
for prop in PROPS:
  print("%s" % prop)
print('*/')
print('')

num_ranges = len(ranges)
print("static int WB_RANGE_NUM = %d;" % num_ranges)

# Emit one C initializer per range; the last entry gets no trailing comma.
print('static WB_RANGE_TYPE WB_RANGES[] = {')
for i, (start, end, prop) in enumerate(ranges):
  if i == num_ranges - 1:
    comma = ''
  else:
    comma = ','

  type_name = 'WB_' + prop
  print(" {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma))

print('};')

sys.exit(0)
268