#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_egcb_data.py
# Copyright (c) 2017-2018 K.Kosako
"""Generate unicode_egcb_data.c from GraphemeBreakProperty.txt.

Reads 'GraphemeBreakProperty.txt' from the current directory and writes a
C table of (start, end, property) code point ranges to standard output.
The script runs unchanged on Python 2.6+ and Python 3.
"""

import re
import sys

MAX_CODE_POINT = 0x10ffff

# Comment line that closes the listing of one property.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# "XXXX ; Prop" or "XXXX..YYYY ; Prop" (the range separator is a literal "..").
PR_LINE_REG = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# Comment line ending in "-<major>.<minor>.<patch>.txt".
VERSION_REG = re.compile(r"#\s*.*-(\d+\.\d+\.\d+)\.txt")

VERSION_INFO = None
DIC = {}
PROPS = []
PropIndex = {}


def check_version_info(s):
    """Store the version captured by VERSION_REG from s in VERSION_INFO.

    VERSION_INFO is left untouched when s does not match.
    """
    global VERSION_INFO
    m = VERSION_REG.match(s)
    if m is not None:
        VERSION_INFO = m.group(1)


def print_ranges(ranges):
    """Print each (start, end) pair as two zero-padded hexadecimal numbers."""
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))


def print_prop_and_index(prop, i):
    """Print prop followed by a comma and index i, and record i in PropIndex."""
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i


def dic_find_by_value(dic, v):
    """Return a key of dic whose value equals v, or None if there is none."""
    for key, val in dic.items():
        if val == v:
            return key

    return None


def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or adjacent (start, end) ranges into a new list.

    in_ranges must already be ordered by start unless sort is true, in
    which case a sorted copy is processed.  The input list is not modified.
    """
    ranges = sorted(in_ranges) if sort else in_ranges

    r = []
    for (start, end) in ranges:
        # Compare against the last merged range rather than a None-initialised
        # "prev" value: None >= int is an error on Python 3.
        if r and r[-1][1] >= start - 1:
            (pstart, pend) = r.pop()
            end = max(pend, end)
            start = pstart

        r.append((start, end))

    return r


def inverse_ranges(in_ranges):
    """Return the gaps of in_ranges within 0..MAX_CODE_POINT.

    in_ranges is expected to be ordered and non-overlapping.
    """
    r = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))

        prev = end + 1

    # "<=" so that a final gap consisting only of MAX_CODE_POINT is kept.
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))

    return r


def add_ranges(r1, r2):
    """Return the normalized union of the range lists r1 and r2."""
    return normalize_ranges(r1 + r2, True)


def sub_one_range(one_range, rs):
    """Return the parts of one_range that are not covered by the ranges rs.

    rs is walked in order; each range that intersects the remaining part of
    one_range trims it from the left or splits a piece off.
    """
    r = []
    (s1, e1) = one_range
    for (s2, e2) in rs:
        if s2 >= s1 and s2 <= e1:
            # rs range starts inside the remainder: keep what lies before it.
            if s2 > s1:
                r.append((s1, s2 - 1))
            if e2 >= e1:
                return r

            s1 = e2 + 1
        elif s2 < s1 and e2 >= s1:
            # rs range starts before the remainder and reaches into it.
            if e2 < e1:
                s1 = e2 + 1
            else:
                return r

    r.append((s1, e1))
    return r


def sub_ranges(r1, r2):
    """Return the ranges of r1 with everything covered by r2 removed."""
    r = []
    for one_range in r1:
        r.extend(sub_one_range(one_range, r2))

    return r


def add_ranges_in_dic(dic):
    """Return the normalized union of all range lists stored in dic."""
    r = []
    for v in dic.values():
        r = r + v

    return normalize_ranges(r, True)


def normalize_ranges_in_dic(dic, sort=False):
    """Replace every range list in dic by its normalized form."""
    for k, v in dic.items():
        dic[k] = normalize_ranges(v, sort)


def merge_dic(to_dic, from_dic):
    """Update to_dic with from_dic, reporting shared keys on stderr."""
    common = list(set(to_dic.keys()) & set(from_dic.keys()))
    if len(common) != 0:
        sys.stderr.write("merge_dic: collision: %s\n" % sorted(common))

    to_dic.update(from_dic)


def merge_props(to_props, from_props):
    """Extend to_props with from_props, reporting shared names on stderr."""
    common = list(set(to_props) & set(from_props))
    if len(common) != 0:
        sys.stderr.write("merge_props: collision: %s\n" % sorted(common))

    to_props.extend(from_props)


def add_range_into_dic(dic, name, start, end):
    """Append (start, end) to the list dic[name], creating it if missing."""
    d = dic.get(name, None)
    if d is None:
        dic[name] = [(start, end)]
    else:
        d.append((start, end))


def list_sub(a, b):
    """Return the elements of a that are not in b (as a list, unordered)."""
    return list(set(a) - set(b))


def parse_properties(path):
    """Parse a property data file in the GraphemeBreakProperty.txt layout.

    Returns (dic, props): dic maps each property name to its normalized
    list of (start, end) ranges, props lists the property names in the
    order their "# Total ..." lines appear.  While VERSION_INFO is still
    None, comment lines are also checked for a version number.
    """
    with open(path, 'r') as f:
        dic = {}
        prop = None
        props = []
        for line in f:
            s = line.strip()
            if len(s) == 0:
                continue

            if s[0] == '#':
                if VERSION_INFO is None:
                    check_version_info(s)

            m = PR_LINE_REG.match(s)
            if m:
                prop = m.group(3)
                start = int(m.group(1), 16)
                # A single code point is stored as a one-element range.
                end = int(m.group(2), 16) if m.group(2) else start
                add_range_into_dic(dic, prop, start, end)

            elif PR_TOTAL_REG.match(s) is not None:
                # A "Total" line closes the property of the preceding data
                # lines; ignore one that appears before any data line.
                if prop is not None:
                    props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props)


def find_overlap(ranges):
    """Return the first range that overlaps its predecessor, or None.

    ranges is a list of (start, end, prop) tuples ordered by start.  The
    result is (start, end, prop, prev_end), where prev_end is the end of
    the preceding range.
    """
    prev = -1
    for (start, end, prop) in ranges:
        if prev >= start:
            return (start, end, prop, prev)

        # Advance to the end of the range just checked; without this the
        # comparison above could never become true.
        prev = end

    return None


COPYRIGHT = '''
/*-
 * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()


def main():
    """Parse GraphemeBreakProperty.txt and print the C source to stdout.

    Raises ValueError when ranges of different properties overlap.
    Returns 0, the process exit status.
    """
    dic, props = parse_properties('GraphemeBreakProperty.txt')
    merge_dic(DIC, dic)
    merge_props(PROPS, props)

    PROPS.sort()

    print('/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */')
    print(COPYRIGHT)
    print('')
    if VERSION_INFO is not None:
        print("#define GRAPHEME_BREAK_PROPERTY_VERSION %s"
              % re.sub(r'[\.-]', '_', VERSION_INFO))
        print('')

    ranges = []
    for prop in PROPS:
        for (start, end) in DIC[prop]:
            ranges.append((start, end, prop))

    ranges = sorted(ranges, key=lambda x: x[0])

    overlap = find_overlap(ranges)
    if overlap is not None:
        (start, end, prop, prev) = overlap
        raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))

    print('/*')
    for prop in PROPS:
        print("%s" % prop)
    print('*/')
    print('')

    num_ranges = len(ranges)
    print("static int EGCB_RANGE_NUM = %d;" % num_ranges)

    print('static EGCB_RANGE_TYPE EGCB_RANGES[] = {')
    for i, (start, end, prop) in enumerate(ranges):
        # No trailing comma after the last initializer.
        comma = '' if i == num_ranges - 1 else ','
        type_name = 'EGCB_' + prop
        print(" {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma))

    print('};')
    return 0


### main ###
if __name__ == '__main__':
    sys.exit(main())