1#!/usr/bin/env python3
2
3"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
4
5Input file:
6* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
7"""
8
9import sys, re
10import logging
# Progress messages go through logging; the generated table itself is
# written with print(), so the two streams stay separate.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Accept the UCDXML path plus an optional hb-common.h path; anything else
# terminates with the usage text held in the module docstring.
if not 2 <= len(sys.argv) <= 3:
    sys.exit(__doc__)
15
16# https://github.com/harfbuzz/packtab
17import packTab
18import packTab.ucdxml
19
# Parse the UCDXML file named by the first argument and derive the
# repertoire object that the rest of the script iterates over.
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# The optional second argument overrides the header the script list is
# read from; by default it is looked up in the current directory.
hb_common_h = sys.argv[2] if len(sys.argv) >= 3 else 'hb-common.h'
25
logging.info('Preparing data tables...')

# One entry per position in `ucd`; the position is used as the code point.
# 'gc', 'ccc', 'bmg', 'sc', 'dm', 'dt' and 'Comp_Ex' are UCDXML attribute names.
gc = [record['gc'] for record in ucd]
ccc = [int(record['ccc']) for record in ucd]
# 'bmg' is stored as a signed delta from the code point (0 when absent).
bmg = [int(record['bmg'], 16) - cp if record['bmg'] else 0
       for cp, record in enumerate(ucd)]
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)

sc = [record['sc'] for record in ucd]

# Canonical ('can') decompositions only, keyed by code point. '#' marks
# "no mapping", and the 11172 code points starting at U+AC00 (the Hangul
# syllable range) are left out of the table.
dm = {
    cp: tuple(int(part, 16) for part in record['dm'].split())
    for cp, record in enumerate(ucd)
    if record['dm'] != '#'
    and record['dt'] == 'can'
    and cp not in range(0xAC00, 0xAC00 + 11172)
}
# Code points flagged Comp_Ex == 'Y'.
ce = {cp for cp, record in enumerate(ucd) if record['Comp_Ex'] == 'Y'}
39
# Every retained decomposition must consist of exactly one or two code
# points. The length is checked directly: a truthiness-based check would
# silently accept an empty tuple, which neither dm1 nor dm2 would pick up.
assert all(len(v) in (1, 2) for v in dm.values())

# Singleton decompositions, deduplicated and sorted. Sorting puts plane-0
# targets before plane-2 targets, so an index into dm1 is also an index
# into the plane-0 array followed by the plane-2 array.
dm1 = sorted({v for v in dm.values() if len(v) == 1})
# Only planes 0 and 2 are representable by the two 16-bit arrays below.
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Indices start at 1; 0 is the value used for "no decomposition".
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}

# Pair decompositions as ((a, b, c), (a, b)) sorted by the triple, where c
# is the decomposing code point itself when it is not a composition
# exclusion and has ccc == 0, and 0 otherwise.
dm2 = sorted((v + (i if i not in ce and not ccc[i] else 0,), v)
             for i, v in dm.items() if len(v) == 2)
49
def filt(v):
    """Return True when the triple *v* fits the packed 11/7/14-bit encoding.

    The triple is emitted through HB_CODEPOINT_ENCODE3_11_7_14 when this
    returns True, so each component must fit its field:

    * v[0] must fit in 11 bits;
    * v[1] must lie in 0x0300..0x037F (a 7-bit offset from 0x0300);
    * v[2] must fit in 14 bits.

    The third mask covers every bit above bit 13. A narrower mask that
    skipped bits 16-19 would accept values such as 0x10000 that cannot be
    stored in a 14-bit field.
    """
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFFFC000) == 0x0000)
# Partition the sorted pair decompositions by whether their triple fits the
# compact encoding. The assert guarantees that the compact entries form a
# prefix of dm2, so positions in dm2 match positions in the two arrays
# laid end to end.
dm2_u32_array = [entry for entry in dm2 if filt(entry[0])]
dm2_u64_array = [entry for entry in dm2 if not filt(entry[0])]
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % entry[0]
                 for entry in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % entry[0]
                 for entry in dm2_u64_array]

# Pair indices continue after slot 0 and after all singleton entries.
dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {entry[1]: dm2_base + pos for pos, entry in enumerate(dm2)}

# Combined mapping from decomposition tuple to table index; None (no
# decomposition) maps to 0.
dm_order = {None: 0, **dm1_order, **dm2_order}
65
# Bidirectional lookup between the two-letter general-category codes and
# their integer ids: gc_order['Lu'] == 9 and gc_order[9] == 'Lu'.
gc_order = {}
for index, category in enumerate(
        'Cc Cf Cn Co Cs Ll Lm Lo Lt Lu '
        'Mc Me Mn Nd Nl No Pc Pd Pe Pf '
        'Pi Po Ps Sc Sk Sm So Zl Zp Zs'.split()):
    gc_order.update({index: category, category: index})
72
# Build the script tables from the header named by hb_common_h.
#   sc_order: bidirectional mapping between the 4-character script tag and
#             its index (order of appearance in the header).
#   sc_array: HB_SCRIPT_* identifiers, positioned by that same index.
sc_order = dict()
sc_array = []
# Matches a line holding an HB_SCRIPT_* name followed by its HB_TAG with
# four quoted single characters.
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Use a context manager so the header is closed deterministically instead
# of being left to the garbage collector.
with open(hb_common_h) as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        sc_array.append(name)
85
# Values passed as `compression=` to packTab.pack_table, one per generated
# table variant.
DEFAULT, COMPACT, SLOPPY = 1, 3, 5
89
90
logging.info('Generating output...')

# Banner comment recording how the table was produced.
print(
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " *   ./gen-ucd-table.py ucd.nounihan.grouped.xml",
    " *",
    sep="\n",
)
print(" * on file with this description:", ucdxml.description)

# End of banner, include guard, and the single include the table needs.
print(
    " */",
    "",
    "#ifndef HB_UCD_TABLE_HH",
    "#define HB_UCD_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
    sep="\n",
)

# Auxiliary arrays shared by every compression variant: the script list and
# the decomposition payload arrays. Each Python name is rebound to the
# first element of the pair returned by addArray.
code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
code.print_c(linkage='static inline')
114
# One row per generated lookup:
#   (name, per-code-point data, default value, value -> integer mapping).
datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]


def _smear_over_default(values, default):
    """Overwrite *default* entries of *values* in place with a neighbour's value.

    Works within 128-entry blocks: a forward pass copies the previous entry
    into every default entry that is not the first of its block, then a
    backward pass copies the next entry into every remaining default entry
    that is not the last of its block. Blocks consisting solely of
    *default* therefore stay untouched.
    """
    for pos, value in enumerate(values):
        if pos % 128 and value == default:
            values[pos] = values[pos - 1]
    for pos in reversed(range(len(values) - 1)):
        if (pos + 1) % 128 and values[pos] == default:
            values[pos] = values[pos + 1]


# Preprocessor line that opens each variant; anything not listed falls
# through to the final '#else' branch.
_variant_guard = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
}

for compression in (DEFAULT, COMPACT, SLOPPY):
    logging.info('  Compression=%d:' % compression)
    print()
    print(_variant_guard.get(compression, '#else'))
    print()

    if compression == SLOPPY:
        # The sloppy variant gives up exact answers for unassigned ('Cn')
        # and unknown-script ('Zzzz') code points. gc and sc are modified
        # in place, so this must remain the last variant generated.
        _smear_over_default(gc, 'Cn')
        _smear_over_default(sc, 'Zzzz')

    code = packTab.Code('_hb_ucd')

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info('      Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
        sol.genCode(code, name)

    code.print_c(linkage='static inline')

    print()
159
# Close the #ifndef / #elif / #else chain opened by the variant loop.
print('#endif', '', sep='\n')

# Close the include guard and mark the end of the generated region.
print(
    '',
    "#endif /* HB_UCD_TABLE_HH */",
    '',
    "/* == End of generated table == */",
    sep='\n',
)
logging.info('Done.')
168