#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_egcb_data.py
# Copyright (c) 2017-2020 K.Kosako
"""Generate the C source of unicode_egcb_data.c on standard output.

The script reads 'GraphemeBreakProperty.txt' from the current directory,
collects the code point ranges of every property listed in it and prints a
C table of (start, end, EGCB_<property>) entries sorted by start code point.

The code runs unchanged on Python 2.6+ and Python 3: every print call passes
exactly one expression, and diagnostics go through sys.stderr.write().
"""

import sys
import re
import io

MAX_CODE_POINT = 0x10ffff

PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# "XXXX ; Name" or "XXXX..YYYY ; Name".  The range separator is escaped so
# that only a literal ".." is accepted between the two code points.
PR_LINE_REG = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
VERSION_REG = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")

# [major, minor, update]; stays [-1, -1, -1] until a version line is seen.
VERSION_INFO = [-1, -1, -1]
DIC = {}
PROPS = []
PropIndex = {}

COPYRIGHT = '''
/*-
 * Copyright (c) 2017-2020 K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()


def check_version_info(s):
    """Store the version numbers found in comment line *s* into VERSION_INFO.

    The line must match VERSION_REG (e.g. "# Name-13.0.0.txt"); otherwise
    VERSION_INFO is left untouched.  The list is updated in place.
    """
    m = VERSION_REG.match(s)
    if m is not None:
        VERSION_INFO[:] = [int(m.group(i)) for i in (1, 2, 3)]


def print_ranges(ranges):
    """Print every (start, end) pair as two zero-padded hex literals."""
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))


def print_prop_and_index(prop, i):
    """Print "<prop>, <i>" in aligned columns and record i in PropIndex."""
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i


def dic_find_by_value(dic, v):
    """Return the first key of *dic* whose value equals *v*, or None."""
    for key, val in dic.items():
        if val == v:
            return key

    return None


def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or directly adjacent (start, end) ranges.

    The input is processed in order, so it has to be ordered by start unless
    *sort* is true, in which case a sorted copy is used.  Returns a new list;
    *in_ranges* is not modified.
    """
    ranges = sorted(in_ranges) if sort else in_ranges

    r = []
    for (start, end) in ranges:
        # Look at the last merged range instead of comparing against an
        # initial None, which raises TypeError on Python 3.
        if r and r[-1][1] >= start - 1:
            (pstart, pend) = r.pop()
            end = max(pend, end)
            start = pstart

        r.append((start, end))

    return r


def inverse_ranges(in_ranges):
    """Return the ranges of 0..MAX_CODE_POINT not covered by *in_ranges*.

    *in_ranges* is walked in order and is expected to be ordered by start.
    """
    r = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))

        prev = end + 1

    # "<=" (not "<"): when the input stops at MAX_CODE_POINT - 1 the single
    # remaining code point MAX_CODE_POINT still belongs to the complement.
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))

    return r


def add_ranges(r1, r2):
    """Return the normalized union of two range lists."""
    return normalize_ranges(r1 + r2, True)


def sub_one_range(one_range, rs):
    """Return the parts of *one_range* that are not covered by ranges in *rs*.

    *rs* is scanned once in list order; the remaining start of the range is
    advanced past each overlapping entry.
    """
    r = []
    (s1, e1) = one_range
    for (s2, e2) in rs:
        if s1 <= s2 <= e1:
            # (s2, e2) starts inside the remaining range: keep what precedes it.
            if s2 > s1:
                r.append((s1, s2 - 1))
            if e2 >= e1:
                return r

            s1 = e2 + 1
        elif s2 < s1 and e2 >= s1:
            # (s2, e2) starts before the remaining range and reaches into it.
            if e2 < e1:
                s1 = e2 + 1
            else:
                return r

    r.append((s1, e1))
    return r


def sub_ranges(r1, r2):
    """Return *r1* with everything covered by *r2* removed."""
    r = []
    for one_range in r1:
        r.extend(sub_one_range(one_range, r2))

    return r


def add_ranges_in_dic(dic):
    """Return the normalized union of all range lists stored in *dic*."""
    r = []
    for v in dic.values():
        r.extend(v)

    return normalize_ranges(r, True)


def normalize_ranges_in_dic(dic, sort=False):
    """Replace every range list in *dic* by its normalized form (in place)."""
    for k in list(dic):
        dic[k] = normalize_ranges(dic[k], sort)


def merge_dic(to_dic, from_dic):
    """Copy *from_dic* into *to_dic*, reporting shared keys on stderr."""
    common = list(set(to_dic.keys()) & set(from_dic.keys()))
    if len(common) != 0:
        sys.stderr.write("merge_dic: collision: %s\n" % sorted(common))

    to_dic.update(from_dic)


def merge_props(to_props, from_props):
    """Append *from_props* to *to_props*, reporting shared names on stderr."""
    common = list(set(to_props) & set(from_props))
    if len(common) != 0:
        sys.stderr.write("merge_props: collision: %s\n" % sorted(common))

    to_props.extend(from_props)


def add_range_into_dic(dic, name, start, end):
    """Append (start, end) to the list stored under *name*, creating it."""
    dic.setdefault(name, []).append((start, end))


def list_sub(a, b):
    """Return the elements of *a* that are not in *b* (order not preserved)."""
    return list(set(a) - set(b))


def parse_properties(path):
    """Parse a property data file into (dic, props).

    dic maps each property name to its normalized list of (start, end) code
    point ranges.  props receives the most recently seen property name each
    time a "# Total code points:" / "# Total elements:" comment is found.
    While VERSION_INFO is still unset, comment lines are also checked for the
    data file version.
    """
    dic = {}
    prop = None
    props = []
    # Read as UTF-8 explicitly so decoding does not depend on the locale.
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            s = line.strip()
            if len(s) == 0:
                continue

            if s[0] == '#':
                if VERSION_INFO[0] < 0:
                    check_version_info(s)

            m = PR_LINE_REG.match(s)
            if m:
                # str() keeps the name a native string on Python 2, where
                # io.open() yields unicode lines.
                prop = str(m.group(3))
                start = int(m.group(1), 16)
                end = int(m.group(2), 16) if m.group(2) else start
                add_range_into_dic(dic, prop, start, end)
            elif PR_TOTAL_REG.match(s) is not None:
                # A "Total" line before any data line has no property to
                # record; appending None would break the later sort/lookup.
                if prop is not None:
                    props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props)


def find_range_overlap(ranges):
    """Return the first entry of *ranges* that overlaps its predecessor.

    *ranges* is a list of (start, end, prop) tuples ordered by start.  The
    result is (start, end, prop, prev_end) for the first entry whose start is
    not greater than the previous entry's end, or None if there is none.
    """
    prev = -1
    for (start, end, prop) in ranges:
        if prev >= start:
            return (start, end, prop, prev)

        # Remember the end of this range; without this update the check
        # above could never trigger.
        prev = end

    return None


def main():
    """Read GraphemeBreakProperty.txt and print the generated C source.

    Raises RuntimeError when no version line was found in the data file and
    ValueError when two collected ranges overlap.
    """
    dic, props = parse_properties('GraphemeBreakProperty.txt')
    merge_dic(DIC, dic)
    merge_props(PROPS, props)

    PROPS.sort()

    print('/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */')
    print(COPYRIGHT)
    print('')
    if VERSION_INFO[0] < 0:
        raise RuntimeError("Version is not found")

    print("#define GRAPHEME_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]))
    print('')

    ranges = []
    for prop in PROPS:
        for (start, end) in DIC[prop]:
            ranges.append((start, end, prop))

    ranges = sorted(ranges, key=lambda x: x[0])

    overlap = find_range_overlap(ranges)
    if overlap is not None:
        (start, end, prop, prev) = overlap
        raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))

    print('/*')
    for prop in PROPS:
        print("%s" % prop)
    print('*/')
    print('')

    num_ranges = len(ranges)
    print("static int EGCB_RANGE_NUM = %d;" % num_ranges)

    print('static EGCB_RANGE_TYPE EGCB_RANGES[] = {')
    for i, (start, end, prop) in enumerate(ranges):
        # No trailing comma after the last initializer.
        comma = '' if i == num_ranges - 1 else ','
        type_name = 'EGCB_' + prop
        print(" {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma))

    print('};')


### main ###
if __name__ == '__main__':
    main()
    sys.exit(0)