1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# make_unicode_fold_data.py
4# Copyright (c) 2016-2020  K.Kosako
5
6import sys
7import re
8
9SOURCE_FILE = 'CaseFolding.txt'
10GPERF_UNFOLD_KEY_FILE = 'unicode_unfold_key.gperf'
11GPERF_FOLD_KEY_FILES  = ['unicode_fold1_key.gperf', 'unicode_fold2_key.gperf', 'unicode_fold3_key.gperf']
12
13
14DataName = 'OnigUnicodeFolds'
15
16ENCODING = 'utf-8'
17
18LINE_REG = re.compile("([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)")
19VERSION_REG  = re.compile("#.*-(\d+)\.(\d+)\.(\d+)\.txt")
20
21VERSION_INFO = [-1, -1, -1]
22
23FOLDS = {}
24TURKISH_FOLDS = {}
25LOCALE_FOLDS  = {}
26
27UNFOLDS = {}
28TURKISH_UNFOLDS = {}
29LOCALE_UNFOLDS  = {}
30
31COPYRIGHT = '''
32/*-
33 * Copyright (c) 2017-2020  K.Kosako
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 */
57'''.strip()
58
59
60class Entry:
61    def __init__(self, fold):
62        self.fold = fold
63        self.unfolds = []
64        self.fold_len = len(fold)
65        self.index = -1
66        self.comment = None
67
68def fold_key(fold):
69    sfold = map(lambda i: "%06x" % i, fold)
70    return ':'.join(sfold)
71
72def form16(x, size):
73    form = "0x%06x" if x > 0xffff else "0x%04x"
74    s = form % x
75    rem = size - len(s)
76    if rem > 0:
77        s = ' ' * rem + s
78
79    return s
80
81def form3bytes(x):
82    x0 = x & 0xff
83    x1 = (x>>8)  & 0xff
84    x2 = (x>>16) & 0xff
85    return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0)
86
87def enc_len(code, encode):
88    u = unichr(code)
89    s = u.encode(encode)
90    return len(s)
91
92def check_version_info(s):
93  m = VERSION_REG.match(s)
94  if m is not None:
95    VERSION_INFO[0] = int(m.group(1))
96    VERSION_INFO[1] = int(m.group(2))
97    VERSION_INFO[2] = int(m.group(3))
98
99def parse_line(s):
100    if len(s) == 0:
101      return False
102    if s[0] == '#':
103      if VERSION_INFO[0] < 0:
104        check_version_info(s)
105      return False
106
107    m = LINE_REG.match(s)
108    if m is None:
109        print >> sys.stderr, s.encode(ENCODING)
110        sys.exit(-1)
111
112    s_unfold = m.group(1)
113    s_type   = m.group(2)
114    s_fold   = m.group(3)
115    comment  = m.group(6)
116
117    if s_type == 'S':
118        return False;
119
120    unfold = int(s_unfold, 16)
121    f1     = int(s_fold, 16)
122    fold = [f1]
123    if m.group(4) is not None:
124        f2 = int(m.group(4), 16)
125        fold.append(f2)
126        if m.group(5) is not None:
127            f3 = int(m.group(5), 16)
128            fold.append(f3)
129
130    if s_type == 'T':
131        dic   = TURKISH_FOLDS
132        undic = TURKISH_UNFOLDS
133    else:
134        dic   = FOLDS
135        undic = UNFOLDS
136
137    key = fold_key(fold)
138    e = dic.get(key, None)
139    if e is None:
140        e = Entry(fold)
141        e.comment = comment
142        dic[key] = e
143
144    e.unfolds.append(unfold)
145
146    if undic.get(unfold, None) is not None:
147        print >> sys.stderr, ("unfold dup: 0x%04x %s\n" % (unfold, s_type))
148    undic[unfold] = e
149
150    return True
151
152def parse_file(f):
153    line = f.readline()
154    while line:
155        s = line.strip()
156        parse_line(s)
157        line = f.readline()
158
159def make_locale():
160    for unfold, te in TURKISH_UNFOLDS.items():
161        e = UNFOLDS.get(unfold, None)
162        if e is None:
163            continue
164
165        fkey = fold_key(e.fold)
166        if len(e.unfolds) == 1:
167            del FOLDS[fkey]
168        else:
169            e.unfolds.remove(unfold)
170            e = Entry(e.fold)
171            e.unfolds.append(unfold)
172
173        LOCALE_FOLDS[fkey] = e
174        LOCALE_UNFOLDS[unfold] = e
175        del UNFOLDS[unfold]
176
177def output_typedef(f):
178    s = """\
179typedef unsigned long OnigCodePoint;
180"""
181    print >> f, s
182
183def divide_by_fold_len(d):
184    l = d.items()
185    l1 = filter(lambda (k,e):e.fold_len == 1, l)
186    l2 = filter(lambda (k,e):e.fold_len == 2, l)
187    l3 = filter(lambda (k,e):e.fold_len == 3, l)
188    sl1 = sorted(l1, key=lambda (k,e):k)
189    sl2 = sorted(l2, key=lambda (k,e):k)
190    sl3 = sorted(l3, key=lambda (k,e):k)
191    return (sl1, sl2, sl3)
192
193def output_comment(f, s):
194    f.write(" /* %s */" % s)
195
196def output_data_n1(f, n, fn, c, out_comment):
197    for k, e in fn:
198        e.index = c
199        if out_comment and n > 1 and e.comment is not None:
200            output_comment(f, e.comment)
201            print >> f, ''
202
203        f.write(' ')
204        f.write("/*%4d*/ " % c)
205        for i in range(0, n):
206            s = form16(e.fold[i], 8)
207            f.write(" %s," % s)
208
209        usize = len(e.unfolds)
210        f.write("  %d," % usize)
211        for u in e.unfolds:
212            s = form16(u, 8)
213            f.write(" %s," % s)
214
215        if out_comment and n == 1 and e.comment is not None:
216            if len(e.comment) < 35:
217                s = e.comment
218            else:
219                s = e.comment[0:33] + '..'
220
221            output_comment(f, s)
222
223        f.write("\n")
224        c += n + 1 + usize
225
226    return c
227
228def output_data_n(f, name, n, fn, lfn, out_comment):
229    print >> f, "OnigCodePoint %s%d[] = {" % (name, n)
230    c = 0
231    c = output_data_n1(f, n,  fn, c, out_comment)
232    print >> f, "#define FOLDS%d_NORMAL_END_INDEX   %d" % (n, c)
233    print >> f, " /* ----- LOCALE ----- */"
234    c = output_data_n1(f, n, lfn, c, out_comment)
235    print >> f, "#define FOLDS%d_END_INDEX   %d" % (n, c)
236    print >> f, "};"
237
238def output_fold_data(f, name, out_comment):
239    f1, f2, f3 = divide_by_fold_len(FOLDS)
240    lf1, lf2, lf3 = divide_by_fold_len(LOCALE_FOLDS)
241
242    output_data_n(f, name, 1, f1, lf1, out_comment)
243    print >> f, ''
244    output_data_n(f, name, 2, f2, lf2, out_comment)
245    print >> f, ''
246    output_data_n(f, name, 3, f3, lf3, out_comment)
247    print >> f, ''
248
249def output_macros(f, name):
250    print >> f, "#define FOLDS1_FOLD(i)         (%s1 + (i))" % name
251    print >> f, "#define FOLDS2_FOLD(i)         (%s2 + (i))" % name
252    print >> f, "#define FOLDS3_FOLD(i)         (%s3 + (i))" % name
253
254    print >> f, "#define FOLDS1_UNFOLDS_NUM(i)  %s1[(i)+1]" % name
255    print >> f, "#define FOLDS2_UNFOLDS_NUM(i)  %s2[(i)+2]" % name
256    print >> f, "#define FOLDS3_UNFOLDS_NUM(i)  %s3[(i)+3]" % name
257
258    print >> f, "#define FOLDS1_UNFOLDS(i)      (%s1 + (i) + 2)" % name
259    print >> f, "#define FOLDS2_UNFOLDS(i)      (%s2 + (i) + 3)" % name
260    print >> f, "#define FOLDS3_UNFOLDS(i)      (%s3 + (i) + 4)" % name
261
262    print >> f, "#define FOLDS1_NEXT_INDEX(i)   ((i) + 2 + %s1[(i)+1])" % name
263    print >> f, "#define FOLDS2_NEXT_INDEX(i)   ((i) + 3 + %s1[(i)+2])" % name
264    print >> f, "#define FOLDS3_NEXT_INDEX(i)   ((i) + 4 + %s1[(i)+3])" % name
265
266def output_fold_source(f, out_comment):
267    print >> f, "/* This file was generated by make_unicode_fold_data.py. */"
268    print >> f, COPYRIGHT
269    print >> f, "\n"
270    print >> f, '#include "regenc.h"'
271    print >> f, ''
272    if VERSION_INFO[0] < 0:
273      raise RuntimeError("Version is not found")
274
275    print "#define UNICODE_CASEFOLD_VERSION  %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2])
276    print ''
277    #output_macros(f, DataName)
278    print >> f, ''
279    #output_typedef(f)
280    output_fold_data(f, DataName, out_comment)
281
282
283def output_gperf_unfold_key(f):
284    head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\
285
286#include "regint.h"
287%}
288
289struct ByUnfoldKey {
290  OnigCodePoint code;
291  short int   index;
292  short int   fold_len;
293};
294%%
295"""
296    f.write(head)
297    UNFOLDS.update(LOCALE_UNFOLDS)
298    l = UNFOLDS.items()
299    sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index))
300    for k, e in sl:
301        f.write('"%s", /*0x%04x*/ %4d, %d\n' %
302                (form3bytes(k), k, e.index, e.fold_len))
303
304    print >> f, '%%'
305
306def output_gperf_fold_key(f, key_len):
307    head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\
308
309#include "regint.h"
310%}
311
312short int
313%%
314"""
315    f.write(head)
316    l = FOLDS.items()
317    l = filter(lambda (k,e):e.fold_len == key_len, l)
318    sl = sorted(l, key=lambda (k,e):e.index)
319    for k, e in sl:
320        skey = ''.join(map(lambda i: form3bytes(i), e.fold))
321        f.write('"%s", %4d\n' % (skey, e.index))
322
323    print >> f, '%%'
324
325def output_gperf_source():
326   with open(GPERF_UNFOLD_KEY_FILE, 'w') as f:
327       output_gperf_unfold_key(f)
328
329   FOLDS.update(LOCALE_FOLDS)
330
331   for i in range(1, 4):
332       with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f:
333           output_gperf_fold_key(f, i)
334
335def unfolds_byte_length_check(encode):
336    l = UNFOLDS.items()
337    sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index))
338    for unfold, e in sl:
339        key_len = enc_len(unfold, encode)
340        fold_len = sum(map(lambda c: enc_len(c, encode), e.fold))
341        if key_len > fold_len:
342            sfolds = ' '.join(map(lambda c: "0x%06x" % c, e.fold))
343            s = "%s byte length: %d > %d: 0x%06x => %s" % (encode, key_len, fold_len, unfold, sfolds)
344            print >> sys.stderr, s
345
346def double_fold_check():
347    l = UNFOLDS.items()
348    sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index))
349    for unfold, e in sl:
350        for f in e.fold:
351            #print >> sys.stderr, ("check 0x%06x" % f)
352            e2 = UNFOLDS.get(f)
353            if e2 is not None:
354                s = "double folds: 0x%06x => %s, 0x%06x => %s" % (unfold, e.fold, f, e2.fold)
355                print >> sys.stderr, s
356
357def unfold_is_multi_code_folds_head_check():
358    l = UNFOLDS.items()
359    l2 = filter(lambda (k,e):e.fold_len == 2, l)
360    l3 = filter(lambda (k,e):e.fold_len == 3, l)
361    sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index))
362    for unfold, _ in sl:
363        for k, e in l2:
364            if e.fold[0] == unfold:
365                s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold)
366                print >> sys.stderr, s
367        for k, e in l3:
368            if e.fold[0] == unfold:
369                s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold)
370                print >> sys.stderr, s
371
372def make_one_folds(l):
373    h = {}
374    for unfold, e in l:
375        if e.fold_len != 1:
376            continue
377        fold = e.fold[0]
378        unfolds = h.get(fold)
379        if unfolds is None:
380            unfolds = [unfold]
381            h[fold] = unfolds
382        else:
383            unfolds.append(unfold)
384
385    return h
386
387def make_foldn_heads(l, fold_len, one_folds):
388    h = {}
389    for unfold, e in l:
390        if e.fold_len != fold_len:
391            continue
392        unfolds = one_folds.get(e.fold[0])
393        h[e.fold[0]] = (e, unfolds)
394
395    return h
396
397def fold2_expansion_num(e, one_folds):
398    n = len(e.unfolds)
399    n0 = 1
400    u0 = one_folds.get(e.fold[0])
401    if u0 is not None:
402        n0 += len(u0)
403    n1 = 1
404    u1 = one_folds.get(e.fold[1])
405    if u1 is not None:
406        n1 += len(u1)
407    n += (n0 * n1)
408    return n
409
410def fold3_expansion_num(e, one_folds):
411    n = len(e.unfolds)
412    n0 = 1
413    u0 = one_folds.get(e.fold[0])
414    if u0 is not None:
415        n0 += len(u0)
416    n1 = 1
417    u1 = one_folds.get(e.fold[1])
418    if u1 is not None:
419        n1 += len(u1)
420    n2 = 1
421    u2 = one_folds.get(e.fold[2])
422    if u2 is not None:
423        n2 += len(u2)
424    n += (n0 * n1 * n2)
425    return n
426
427def get_all_folds_expansion_num(x, one_folds, fold2_heads, fold3_heads):
428    e = UNFOLDS[x]
429    n = 0
430    if e.fold_len == 1:
431        n1 = len(e.unfolds) + 1 # +1: fold
432        fx = e.fold[0]
433        r = fold2_heads.get(fx)
434        n2 = n3 = 0
435        if r is not None:
436            e2, _ = r
437            n2 = fold2_expansion_num(e2, one_folds)
438        r = fold3_heads.get(fx)
439        if r is not None:
440            e3, _ = r
441            n3 = fold3_expansion_num(e3, one_folds)
442        n = max(n1, n2, n3)
443    elif e.fold_len == 2:
444        n = fold2_expansion_num(e, one_folds)
445    elif e.fold_len == 3:
446        n = fold3_expansion_num(e, one_folds)
447    else:
448        raise RuntimeError("Invalid fold_len %d" % (e.fold_len))
449
450    return n
451
452def get_all_folds_expansion_max_num():
453    l = UNFOLDS.items()
454    one_folds = make_one_folds(l)
455    fold2_heads = make_foldn_heads(l, 2, one_folds)
456    fold3_heads = make_foldn_heads(l, 3, one_folds)
457    sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index))
458    nmax = 0
459    max_unfold = None
460    for unfold, e in sl:
461        n = get_all_folds_expansion_num(unfold, one_folds, fold2_heads, fold3_heads)
462        if nmax < n:
463            nmax = n
464            max_unfold = unfold
465
466    return (nmax, max_unfold)
467
468## main ##
469with open(SOURCE_FILE, 'r') as f:
470    parse_file(f)
471
472make_locale()
473
474out_comment = True
475output_fold_source(sys.stdout, out_comment)
476
477output_gperf_source()
478
479#unfolds_byte_length_check('utf-8')
480#unfolds_byte_length_check('utf-16')
481double_fold_check()
482unfold_is_multi_code_folds_head_check()
483
484#max_num, max_code = get_all_folds_expansion_max_num()
485#max_num -= 1  # remove self
486#print >> sys.stderr, "max expansion: 0x%06x: %d" % (max_code, max_num)
487