1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# make_unicode_egcb_data.py
4# Copyright (c) 2017-2018  K.Kosako
5
6import sys
7import re
8
# Upper bound used by inverse_ranges() when closing the last gap.
MAX_CODE_POINT = 0x10ffff

# All patterns are raw strings so that \s, \w, \d and \. reach the regex
# engine unchanged and do not trigger invalid-escape warnings.

# Matches a "# Total code points:" / "# Total elements:" comment line.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# Matches "XXXX ; Name" or "XXXX..YYYY ; Name".  The range separator is a
# literal ".." (escaped), the same way BL_LINE_REG spells it.
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
# The next three patterns are not referenced in the visible part of this file.
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# Captures the "N.N.N" that precedes ".txt" in a comment line.
VERSION_REG  = re.compile(r"#\s*.*-(\d+\.\d+\.\d+)\.txt")

# Set by check_version_info() once a matching comment line has been seen.
VERSION_INFO = None
# Property name -> list of (start, end) ranges, filled through merge_dic().
DIC  = { }
# Property names, filled through merge_props().
PROPS = []
# Property name -> index, filled by print_prop_and_index().
PropIndex = { }
22
def check_version_info(s):
  """Record the version number found in *s* in the global VERSION_INFO.

  *s* is matched against VERSION_REG; when it matches, the captured
  version text is stored.  Non-matching input leaves VERSION_INFO untouched.
  """
  global VERSION_INFO
  found = VERSION_REG.match(s)
  if found is None:
    return

  VERSION_INFO = found.group(1)
28
def print_ranges(ranges):
  """Print every (start, end) pair in *ranges* on its own line.

  Both values are written as zero-padded six-digit hexadecimal literals
  separated by ", ".
  """
  for (start, end) in ranges:
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call, with identical output.
    print("0x%06x, 0x%06x" % (start, end))
32
def print_prop_and_index(prop, i):
  """Print "<prop>," left-justified in 35 columns followed by index *i*.

  The pair is also recorded in the module-level PropIndex mapping.
  """
  # A single parenthesised argument is valid both as a Python 2 print
  # statement and as a Python 3 print() call, with identical output.
  print("%-35s %3d" % (prop + ',', i))
  PropIndex[prop] = i
36
def dic_find_by_value(dic, v):
  """Return the first key of *dic* whose value equals *v*, or None.

  "First" follows the iteration order of dic.items().
  """
  return next((key for key, val in dic.items() if val == v), None)
43
44
def normalize_ranges(in_ranges, sort=False):
  """Merge overlapping or directly adjacent (start, end) ranges.

  in_ranges -- iterable of (start, end) pairs.  Unless *sort* is true the
               pairs are consumed in the order given, so the caller is
               expected to supply them ordered by start.
  sort      -- when true, work on a sorted copy of *in_ranges*.

  Returns a new list of (start, end) tuples; *in_ranges* is not modified.
  """
  if sort:
    ranges = sorted(in_ranges)
  else:
    ranges = in_ranges

  r = []
  prev = None  # end of the most recently emitted range; None before the first
  for (start, end) in ranges:
    # Test for None explicitly: ordering None against an int only worked by
    # accident on Python 2 and raises TypeError on Python 3.
    if prev is not None and prev >= start - 1:
      # Current range touches or overlaps the previous one: fuse them.
      (pstart, pend) = r.pop()
      end = max(pend, end)
      start = pstart

    r.append((start, end))
    prev = end

  return r
63
def inverse_ranges(in_ranges, max_code_point=None):
  """Return the gaps left uncovered by *in_ranges* within 0..max_code_point.

  in_ranges      -- (start, end) pairs; the loop assumes they are ordered by
                    start and do not overlap (as normalize_ranges produces).
  max_code_point -- inclusive upper bound; defaults to the module-level
                    MAX_CODE_POINT when omitted.

  Returns a list of (start, end) tuples.
  """
  if max_code_point is None:
    max_code_point = MAX_CODE_POINT

  r = []
  prev = 0x000000  # first code point not yet accounted for
  for (start, end) in in_ranges:
    if prev < start:
      r.append((prev, start - 1))

    prev = end + 1

  # "<=" so that a lone trailing code point (input ending at
  # max_code_point - 1) is still reported as a one-element range.
  if prev <= max_code_point:
    r.append((prev, max_code_point))

  return r
77
def add_ranges(r1, r2):
  """Return the normalized union of the range lists *r1* and *r2*.

  The two lists are concatenated and handed to normalize_ranges() with
  sorting enabled.
  """
  return normalize_ranges(r1 + r2, True)
81
def sub_one_range(one_range, rs):
  """Subtract every range in *rs* from the single range *one_range*.

  one_range -- (start, end) pair to cut pieces out of.
  rs        -- list of (start, end) pairs to remove; they are visited in
               list order, and the remaining start only ever moves forward.

  Returns the list of (start, end) pieces of *one_range* that survive.
  """
  (lo, hi) = one_range
  pieces = []
  for (cut_lo, cut_hi) in rs:
    if lo <= cut_lo <= hi:
      # The cut begins inside what is left: keep the part in front of it.
      if cut_lo > lo:
        pieces.append((lo, cut_lo - 1))
      if cut_hi >= hi:
        return pieces

      lo = cut_hi + 1
    elif cut_lo < lo <= cut_hi:
      # The cut begins before what is left but reaches into it.
      if cut_hi >= hi:
        return pieces

      lo = cut_hi + 1

  pieces.append((lo, hi))
  return pieces
103
def sub_ranges(r1, r2):
  """Subtract the range list *r2* from each range of *r1*.

  Each element of *r1* is passed to sub_one_range() together with *r2*;
  the surviving pieces are concatenated in the order of *r1*.
  """
  return [piece
          for one_range in r1
          for piece in sub_one_range(one_range, r2)]
111
def add_ranges_in_dic(dic):
  """Return the normalized union of all range lists stored in *dic*.

  dic -- mapping whose values are lists of (start, end) pairs; the keys
         are not used.

  The combined list is passed to normalize_ranges() with sorting enabled.
  """
  r = []
  # Only the values matter; extend() grows one list in place instead of
  # building a fresh concatenated list on every iteration.
  for v in dic.values():
    r.extend(v)

  return normalize_ranges(r, True)
118
def normalize_ranges_in_dic(dic, sort=False):
  """Replace every value of *dic* with its normalize_ranges() result.

  *sort* is forwarded unchanged to normalize_ranges().  The dictionary is
  updated in place and nothing is returned.
  """
  # Iterate over a snapshot of the keys while the values are reassigned.
  for name in list(dic):
    dic[name] = normalize_ranges(dic[name], sort)
123
def merge_dic(to_dic, from_dic):
  """Copy every entry of *from_dic* into *to_dic* (in place).

  Keys present in both dictionaries are reported on stderr as a collision;
  the value from *from_dic* then overwrites the one in *to_dic*.
  """
  common = set(to_dic) & set(from_dic)
  if common:
    # sys.stderr.write() yields the same text as the Python 2
    # "print >> sys.stderr" form and also works on Python 3.
    sys.stderr.write("merge_dic: collision: %s\n" % sorted(common))

  to_dic.update(from_dic)
132
def merge_props(to_props, from_props):
  """Append every element of *from_props* to the list *to_props* (in place).

  Names present in both lists are reported on stderr as a collision; they
  are still appended, so *to_props* may end up containing duplicates.
  """
  common = set(to_props) & set(from_props)
  if common:
    # sys.stderr.write() yields the same text as the Python 2
    # "print >> sys.stderr" form and also works on Python 3.
    sys.stderr.write("merge_props: collision: %s\n" % sorted(common))

  to_props.extend(from_props)
139
def add_range_into_dic(dic, name, start, end):
  """Append the range (start, end) to the list stored under dic[name].

  When *name* has no list yet, a new one-element list is created for it.
  """
  bucket = dic.get(name)
  if bucket is not None:
    bucket.append((start, end))
    return

  dic[name] = [(start, end)]
147
def list_sub(a, b):
  """Return a list of the distinct elements of *a* that do not occur in *b*.

  The result comes from a set difference, so duplicates are dropped and the
  order of the returned list is unspecified.
  """
  return list(set(a) - set(b))
151
def parse_properties(path):
  """Read the property data file at *path*.

  Every non-blank line is stripped and then handled as follows:
    * a line starting with '#' is offered to check_version_info() for as
      long as VERSION_INFO is still None;
    * a line matching PR_LINE_REG adds its code point (or code point range)
      to the list of the property named on that line;
    * otherwise, a line matching PR_TOTAL_REG appends the most recently
      seen property name to the list of property names.

  Returns (dic, props): *dic* maps property name to its range list after
  normalize_ranges_in_dic(), *props* is the list of property names.
  """
  ranges_by_prop = {}
  seen_props = []
  current = None  # property named on the latest matching data line
  with open(path, 'r') as src:
    for raw in src:
      text = raw.strip()
      if not text:
        continue

      if text.startswith('#') and VERSION_INFO is None:
        check_version_info(text)

      hit = PR_LINE_REG.match(text)
      if hit is None:
        if PR_TOTAL_REG.match(text) is not None:
          seen_props.append(current)
        continue

      current = hit.group(3)
      first = int(hit.group(1), 16)
      # A single code point is stored as a range whose ends coincide.
      last = int(hit.group(2), 16) if hit.group(2) else first
      add_range_into_dic(ranges_by_prop, current, first, last)

  normalize_ranges_in_dic(ranges_by_prop)
  return (ranges_by_prop, seen_props)
182
183
184### main ###
argv = sys.argv
argc = len(argv)

# Parse the data file and fold the result into the module-level tables.
dic, props = parse_properties('GraphemeBreakProperty.txt')
merge_dic(DIC, dic)
merge_props(PROPS, props)

PROPS = sorted(PROPS)

# Every print below passes one parenthesised argument, which is valid both
# as a Python 2 print statement and as a Python 3 print() call and produces
# the same output on either.
print('/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */')
COPYRIGHT = '''
/*-
 * Copyright (c) 2017-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()

print(COPYRIGHT)
print('')
if VERSION_INFO is not None:
  # Dots and dashes of the version text become underscores in the macro value.
  print("#define GRAPHEME_BREAK_PROPERTY_VERSION  %s" % re.sub(r'[\.-]', '_', VERSION_INFO))
  print('')

# Flatten the per-property range lists into (start, end, prop) triples.
ranges = []
for prop in PROPS:
  rs = DIC[prop]
  for (start, end) in rs:
    ranges.append((start, end, prop))

ranges = sorted(ranges, key=lambda x: x[0])

# Ranges belonging to different properties must not overlap.
prev = -1
for (start, end, prop) in ranges:
  if prev >= start:
    raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))

  # Remember where this range ends; without this update prev would stay at
  # -1 and the check above could never fire.
  prev = end


print('/*')
for prop in PROPS:
  print("%s" % prop)
print('*/')
print('')

num_ranges = len(ranges)
print("static int EGCB_RANGE_NUM = %d;" % num_ranges)

print('static EGCB_RANGE_TYPE EGCB_RANGES[] = {')
for i, (start, end, prop) in enumerate(ranges):
  # No trailing comma after the final initializer.
  if i == num_ranges - 1:
    comma = ''
  else:
    comma = ','

  type_name = 'EGCB_' + prop
  print(" {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma))

print('};')

sys.exit(0)
265