1#!/usr/bin/python 2# -*- coding: utf-8 -*- 3# make_unicode_fold_data.py 4# Copyright (c) 2016-2020 K.Kosako 5 6import sys 7import re 8 9SOURCE_FILE = 'CaseFolding.txt' 10GPERF_UNFOLD_KEY_FILE = 'unicode_unfold_key.gperf' 11GPERF_FOLD_KEY_FILES = ['unicode_fold1_key.gperf', 'unicode_fold2_key.gperf', 'unicode_fold3_key.gperf'] 12 13 14DataName = 'OnigUnicodeFolds' 15 16ENCODING = 'utf-8' 17 18LINE_REG = re.compile("([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)") 19VERSION_REG = re.compile("#.*-(\d+)\.(\d+)\.(\d+)\.txt") 20 21VERSION_INFO = [-1, -1, -1] 22 23FOLDS = {} 24TURKISH_FOLDS = {} 25LOCALE_FOLDS = {} 26 27UNFOLDS = {} 28TURKISH_UNFOLDS = {} 29LOCALE_UNFOLDS = {} 30 31COPYRIGHT = ''' 32/*- 33 * Copyright (c) 2017-2020 K.Kosako 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 56 */ 57'''.strip() 58 59 60class Entry: 61 def __init__(self, fold): 62 self.fold = fold 63 self.unfolds = [] 64 self.fold_len = len(fold) 65 self.index = -1 66 self.comment = None 67 68def fold_key(fold): 69 sfold = map(lambda i: "%06x" % i, fold) 70 return ':'.join(sfold) 71 72def form16(x, size): 73 form = "0x%06x" if x > 0xffff else "0x%04x" 74 s = form % x 75 rem = size - len(s) 76 if rem > 0: 77 s = ' ' * rem + s 78 79 return s 80 81def form3bytes(x): 82 x0 = x & 0xff 83 x1 = (x>>8) & 0xff 84 x2 = (x>>16) & 0xff 85 return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) 86 87def enc_len(code, encode): 88 u = unichr(code) 89 s = u.encode(encode) 90 return len(s) 91 92def check_version_info(s): 93 m = VERSION_REG.match(s) 94 if m is not None: 95 VERSION_INFO[0] = int(m.group(1)) 96 VERSION_INFO[1] = int(m.group(2)) 97 VERSION_INFO[2] = int(m.group(3)) 98 99def parse_line(s): 100 if len(s) == 0: 101 return False 102 if s[0] == '#': 103 if VERSION_INFO[0] < 0: 104 check_version_info(s) 105 return False 106 107 m = LINE_REG.match(s) 108 if m is None: 109 print >> sys.stderr, s.encode(ENCODING) 110 sys.exit(-1) 111 112 s_unfold = m.group(1) 113 s_type = m.group(2) 114 s_fold = m.group(3) 115 comment = m.group(6) 116 117 if s_type == 'S': 118 return False; 119 120 unfold = int(s_unfold, 16) 121 f1 = int(s_fold, 16) 122 fold = [f1] 123 if m.group(4) is not None: 124 f2 = int(m.group(4), 16) 125 fold.append(f2) 126 if m.group(5) is not None: 127 f3 = int(m.group(5), 16) 128 fold.append(f3) 129 130 if s_type == 'T': 131 dic = TURKISH_FOLDS 132 undic = TURKISH_UNFOLDS 133 else: 134 dic = FOLDS 135 undic = UNFOLDS 136 137 key = fold_key(fold) 138 e = dic.get(key, None) 139 if e is None: 140 e = Entry(fold) 141 e.comment = comment 142 dic[key] = e 143 144 e.unfolds.append(unfold) 145 146 if undic.get(unfold, None) is not None: 147 print >> sys.stderr, ("unfold dup: 0x%04x %s\n" % (unfold, s_type)) 148 undic[unfold] = e 149 150 return True 151 152def parse_file(f): 153 line = f.readline() 154 while line: 155 s = line.strip() 156 parse_line(s) 157 line = f.readline() 158 159def make_locale(): 160 for unfold, te in TURKISH_UNFOLDS.items(): 161 e = UNFOLDS.get(unfold, None) 162 if e is None: 163 continue 164 165 fkey = fold_key(e.fold) 166 if len(e.unfolds) == 1: 167 del FOLDS[fkey] 168 else: 169 e.unfolds.remove(unfold) 170 e = Entry(e.fold) 171 e.unfolds.append(unfold) 172 173 LOCALE_FOLDS[fkey] = e 174 LOCALE_UNFOLDS[unfold] = e 175 del UNFOLDS[unfold] 176 177def output_typedef(f): 178 s = """\ 179typedef unsigned long OnigCodePoint; 180""" 181 print >> f, s 182 183def divide_by_fold_len(d): 184 l = d.items() 185 l1 = filter(lambda (k,e):e.fold_len == 1, l) 186 l2 = filter(lambda (k,e):e.fold_len == 2, l) 187 l3 = filter(lambda (k,e):e.fold_len == 3, l) 188 sl1 = sorted(l1, key=lambda (k,e):k) 189 sl2 = sorted(l2, key=lambda (k,e):k) 190 sl3 = sorted(l3, key=lambda (k,e):k) 191 return (sl1, sl2, sl3) 192 193def output_comment(f, s): 194 f.write(" /* %s */" % s) 195 196def output_data_n1(f, n, fn, c, out_comment): 197 for k, e in fn: 198 e.index = c 199 if out_comment and n > 1 and e.comment is not None: 200 output_comment(f, e.comment) 201 print >> f, '' 202 203 f.write(' ') 204 f.write("/*%4d*/ " % c) 205 for i in range(0, n): 206 s = form16(e.fold[i], 8) 207 f.write(" %s," % s) 208 209 usize = len(e.unfolds) 210 f.write(" %d," % usize) 211 for u in e.unfolds: 212 s = form16(u, 8) 213 f.write(" %s," % s) 214 215 if out_comment and n == 1 and e.comment is not None: 216 if len(e.comment) < 35: 217 s = e.comment 218 else: 219 s = e.comment[0:33] + '..' 220 221 output_comment(f, s) 222 223 f.write("\n") 224 c += n + 1 + usize 225 226 return c 227 228def output_data_n(f, name, n, fn, lfn, out_comment): 229 print >> f, "OnigCodePoint %s%d[] = {" % (name, n) 230 c = 0 231 c = output_data_n1(f, n, fn, c, out_comment) 232 print >> f, "#define FOLDS%d_NORMAL_END_INDEX %d" % (n, c) 233 print >> f, " /* ----- LOCALE ----- */" 234 c = output_data_n1(f, n, lfn, c, out_comment) 235 print >> f, "#define FOLDS%d_END_INDEX %d" % (n, c) 236 print >> f, "};" 237 238def output_fold_data(f, name, out_comment): 239 f1, f2, f3 = divide_by_fold_len(FOLDS) 240 lf1, lf2, lf3 = divide_by_fold_len(LOCALE_FOLDS) 241 242 output_data_n(f, name, 1, f1, lf1, out_comment) 243 print >> f, '' 244 output_data_n(f, name, 2, f2, lf2, out_comment) 245 print >> f, '' 246 output_data_n(f, name, 3, f3, lf3, out_comment) 247 print >> f, '' 248 249def output_macros(f, name): 250 print >> f, "#define FOLDS1_FOLD(i) (%s1 + (i))" % name 251 print >> f, "#define FOLDS2_FOLD(i) (%s2 + (i))" % name 252 print >> f, "#define FOLDS3_FOLD(i) (%s3 + (i))" % name 253 254 print >> f, "#define FOLDS1_UNFOLDS_NUM(i) %s1[(i)+1]" % name 255 print >> f, "#define FOLDS2_UNFOLDS_NUM(i) %s2[(i)+2]" % name 256 print >> f, "#define FOLDS3_UNFOLDS_NUM(i) %s3[(i)+3]" % name 257 258 print >> f, "#define FOLDS1_UNFOLDS(i) (%s1 + (i) + 2)" % name 259 print >> f, "#define FOLDS2_UNFOLDS(i) (%s2 + (i) + 3)" % name 260 print >> f, "#define FOLDS3_UNFOLDS(i) (%s3 + (i) + 4)" % name 261 262 print >> f, "#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + %s1[(i)+1])" % name 263 print >> f, "#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + %s1[(i)+2])" % name 264 print >> f, "#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + %s1[(i)+3])" % name 265 266def output_fold_source(f, out_comment): 267 print >> f, "/* This file was generated by make_unicode_fold_data.py. */" 268 print >> f, COPYRIGHT 269 print >> f, "\n" 270 print >> f, '#include "regenc.h"' 271 print >> f, '' 272 if VERSION_INFO[0] < 0: 273 raise RuntimeError("Version is not found") 274 275 print "#define UNICODE_CASEFOLD_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) 276 print '' 277 #output_macros(f, DataName) 278 print >> f, '' 279 #output_typedef(f) 280 output_fold_data(f, DataName, out_comment) 281 282 283def output_gperf_unfold_key(f): 284 head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\ 285 286#include "regint.h" 287%} 288 289struct ByUnfoldKey { 290 OnigCodePoint code; 291 short int index; 292 short int fold_len; 293}; 294%% 295""" 296 f.write(head) 297 UNFOLDS.update(LOCALE_UNFOLDS) 298 l = UNFOLDS.items() 299 sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) 300 for k, e in sl: 301 f.write('"%s", /*0x%04x*/ %4d, %d\n' % 302 (form3bytes(k), k, e.index, e.fold_len)) 303 304 print >> f, '%%' 305 306def output_gperf_fold_key(f, key_len): 307 head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\ 308 309#include "regint.h" 310%} 311 312short int 313%% 314""" 315 f.write(head) 316 l = FOLDS.items() 317 l = filter(lambda (k,e):e.fold_len == key_len, l) 318 sl = sorted(l, key=lambda (k,e):e.index) 319 for k, e in sl: 320 skey = ''.join(map(lambda i: form3bytes(i), e.fold)) 321 f.write('"%s", %4d\n' % (skey, e.index)) 322 323 print >> f, '%%' 324 325def output_gperf_source(): 326 with open(GPERF_UNFOLD_KEY_FILE, 'w') as f: 327 output_gperf_unfold_key(f) 328 329 FOLDS.update(LOCALE_FOLDS) 330 331 for i in range(1, 4): 332 with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f: 333 output_gperf_fold_key(f, i) 334 335def unfolds_byte_length_check(encode): 336 l = UNFOLDS.items() 337 sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) 338 for unfold, e in sl: 339 key_len = enc_len(unfold, encode) 340 fold_len = sum(map(lambda c: enc_len(c, encode), e.fold)) 341 if key_len > fold_len: 342 sfolds = ' '.join(map(lambda c: "0x%06x" % c, e.fold)) 343 s = "%s byte length: %d > %d: 0x%06x => %s" % (encode, key_len, fold_len, unfold, sfolds) 344 print >> sys.stderr, s 345 346def double_fold_check(): 347 l = UNFOLDS.items() 348 sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) 349 for unfold, e in sl: 350 for f in e.fold: 351 #print >> sys.stderr, ("check 0x%06x" % f) 352 e2 = UNFOLDS.get(f) 353 if e2 is not None: 354 s = "double folds: 0x%06x => %s, 0x%06x => %s" % (unfold, e.fold, f, e2.fold) 355 print >> sys.stderr, s 356 357def unfold_is_multi_code_folds_head_check(): 358 l = UNFOLDS.items() 359 l2 = filter(lambda (k,e):e.fold_len == 2, l) 360 l3 = filter(lambda (k,e):e.fold_len == 3, l) 361 sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) 362 for unfold, _ in sl: 363 for k, e in l2: 364 if e.fold[0] == unfold: 365 s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold) 366 print >> sys.stderr, s 367 for k, e in l3: 368 if e.fold[0] == unfold: 369 s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold) 370 print >> sys.stderr, s 371 372def make_one_folds(l): 373 h = {} 374 for unfold, e in l: 375 if e.fold_len != 1: 376 continue 377 fold = e.fold[0] 378 unfolds = h.get(fold) 379 if unfolds is None: 380 unfolds = [unfold] 381 h[fold] = unfolds 382 else: 383 unfolds.append(unfold) 384 385 return h 386 387def make_foldn_heads(l, fold_len, one_folds): 388 h = {} 389 for unfold, e in l: 390 if e.fold_len != fold_len: 391 continue 392 unfolds = one_folds.get(e.fold[0]) 393 h[e.fold[0]] = (e, unfolds) 394 395 return h 396 397def fold2_expansion_num(e, one_folds): 398 n = len(e.unfolds) 399 n0 = 1 400 u0 = one_folds.get(e.fold[0]) 401 if u0 is not None: 402 n0 += len(u0) 403 n1 = 1 404 u1 = one_folds.get(e.fold[1]) 405 if u1 is not None: 406 n1 += len(u1) 407 n += (n0 * n1) 408 return n 409 410def fold3_expansion_num(e, one_folds): 411 n = len(e.unfolds) 412 n0 = 1 413 u0 = one_folds.get(e.fold[0]) 414 if u0 is not None: 415 n0 += len(u0) 416 n1 = 1 417 u1 = one_folds.get(e.fold[1]) 418 if u1 is not None: 419 n1 += len(u1) 420 n2 = 1 421 u2 = one_folds.get(e.fold[2]) 422 if u2 is not None: 423 n2 += len(u2) 424 n += (n0 * n1 * n2) 425 return n 426 427def get_all_folds_expansion_num(x, one_folds, fold2_heads, fold3_heads): 428 e = UNFOLDS[x] 429 n = 0 430 if e.fold_len == 1: 431 n1 = len(e.unfolds) + 1 # +1: fold 432 fx = e.fold[0] 433 r = fold2_heads.get(fx) 434 n2 = n3 = 0 435 if r is not None: 436 e2, _ = r 437 n2 = fold2_expansion_num(e2, one_folds) 438 r = fold3_heads.get(fx) 439 if r is not None: 440 e3, _ = r 441 n3 = fold3_expansion_num(e3, one_folds) 442 n = max(n1, n2, n3) 443 elif e.fold_len == 2: 444 n = fold2_expansion_num(e, one_folds) 445 elif e.fold_len == 3: 446 n = fold3_expansion_num(e, one_folds) 447 else: 448 raise RuntimeError("Invalid fold_len %d" % (e.fold_len)) 449 450 return n 451 452def get_all_folds_expansion_max_num(): 453 l = UNFOLDS.items() 454 one_folds = make_one_folds(l) 455 fold2_heads = make_foldn_heads(l, 2, one_folds) 456 fold3_heads = make_foldn_heads(l, 3, one_folds) 457 sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) 458 nmax = 0 459 max_unfold = None 460 for unfold, e in sl: 461 n = get_all_folds_expansion_num(unfold, one_folds, fold2_heads, fold3_heads) 462 if nmax < n: 463 nmax = n 464 max_unfold = unfold 465 466 return (nmax, max_unfold) 467 468## main ## 469with open(SOURCE_FILE, 'r') as f: 470 parse_file(f) 471 472make_locale() 473 474out_comment = True 475output_fold_source(sys.stdout, out_comment) 476 477output_gperf_source() 478 479#unfolds_byte_length_check('utf-8') 480#unfolds_byte_length_check('utf-16') 481double_fold_check() 482unfold_is_multi_code_folds_head_check() 483 484#max_num, max_code = get_all_folds_expansion_max_num() 485#max_num -= 1 # remove self 486#print >> sys.stderr, "max expansion: 0x%06x: %d" % (max_code, max_num) 487