#!/usr/bin/env python3

"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]

Input file:
* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
"""

import sys
import re
import logging
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# Print the usage text (the module docstring) and exit unless we were given
# the UCDXML path and, optionally, the path to hb-common.h.
if len(sys.argv) not in (2, 3):
    sys.exit(__doc__)

# Imported only after the argument check, so the usage message is shown even
# when packTab is not installed.
# https://github.com/harfbuzz/packtab
import packTab
import packTab.ucdxml

logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# Header that is scanned below for HB_SCRIPT_* definitions.
hb_common_h = 'hb-common.h' if len(sys.argv) < 3 else sys.argv[2]

logging.info('Preparing data tables...')

# One entry per item of `ucd`; the list index is used as the code point.
gc = [u['gc'] for u in ucd]
ccc = [int(u['ccc']) for u in ucd]
# Stored as the difference between the 'bmg' value and the code point itself;
# 0 when the 'bmg' attribute is empty.
bmg = [int(v, 16) - int(u) if v else 0 for u, v in enumerate(u['bmg'] for u in ucd)]
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)

sc = [u['sc'] for u in ucd]

# code point -> tuple of decomposition code points.  Only entries whose 'dt'
# is 'can' and whose 'dm' is not '#' are kept, and the 11172 code points
# starting at U+AC00 (the Hangul syllable range) are skipped.
dm = {i: tuple(int(v, 16) for v in u['dm'].split()) for i, u in enumerate(ucd)
      if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
# Code points whose 'Comp_Ex' attribute is 'Y'.
ce = {i for i, u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}

# Everything below assumes a decomposition has exactly one or two elements.
assert not any(v for v in dm.values() if len(v) not in (1, 2))

# Single-element decompositions, split by the plane (bits 16 and up) of their
# target; only planes 0 and 2 are supported.  Order numbers start at 1 because
# 0 is reserved for "no decomposition" (see dm_order).
dm1 = sorted(set(v for v in dm.values() if len(v) == 1))
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
dm1_order = {v: i+1 for i, v in enumerate(dm1)}

# Two-element decompositions as (key, pair), sorted by key.  The key is the
# pair extended with the decomposing code point itself, or with 0 when that
# code point is in `ce` or has a non-zero ccc.
dm2 = sorted((v+(i if i not in ce and not ccc[i] else 0,), v)
             for i, v in dm.items() if len(v) == 2)


def fits_u32(v):
    """Return True if the 3-tuple *v* passes the checks for the 32-bit array.

    The first element must be below 0x800, the second must lie in
    0x0300..0x037F, and the third must have no bits in common with
    0xFFF0C000.
    """
    # NOTE(review): the last mask leaves bits 16-19 unchecked -- confirm that
    # this matches what HB_CODEPOINT_ENCODE3_11_7_14 expects.
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFF0C000) == 0x0000)


dm2_u32_array = [v for v in dm2 if fits_u32(v[0])]
dm2_u64_array = [v for v in dm2 if not fits_u32(v[0])]
# dm2_order below numbers entries by their position in dm2, so make sure the
# split kept that order: every u32 entry must sort before every u64 entry.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]

# Two-element decompositions are numbered after the reserved 0 and after all
# single-element ones.
dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i+dm2_base for i, v in enumerate(dm2)}

# decomposition tuple (or None for "no decomposition") -> order number.
dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)

# Two-way mapping between general-category names and their indices.
gc_order = dict()
for i, v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
                       'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
                       'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
    gc_order[i] = v
    gc_order[v] = i

# Two-way mapping between four-character script tags and their indices, plus
# the HB_SCRIPT_* names in the order they appear in hb-common.h.
sc_order = dict()
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Use a context manager so the header is closed as soon as it has been read.
with open(hb_common_h) as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        tag = ''.join(m.group(2, 3, 4, 5))
        index = len(sc_array)
        sc_order[tag] = index
        sc_order[index] = tag
        sc_array.append(name)

# Values passed as the `compression` argument of packTab.pack_table further
# down in this script.
DEFAULT = 1
COMPACT = 3
SLOPPY = 5


logging.info('Generating output...')
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-ucd-table.py ucd.nounihan.grouped.xml")
print(" *")
print(" * on file with this description:", ucdxml.description)
print(" */")
print()
print("#ifndef HB_UCD_TABLE_HH")
print("#define HB_UCD_TABLE_HH")
print()
print('#include "hb.hh"')
print()

# Register the flat lookup arrays.  addArray returns a pair; only its first
# element is kept, rebinding each *_array name.
code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
# Emit the flat arrays registered on `code` so far.
code.print_c(linkage='static inline')

# (table name, per-code-point data, default value, value<->index mapping).
datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]

# Preprocessor line that opens each variant's section; any level not listed
# here falls into the final '#else' branch.
section_guard = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
}


def _smear_default(values, default):
    """Overwrite *default* entries of *values* in place with a neighbour.

    The forward pass copies the previous element into every default-valued
    slot whose index is not a multiple of 128.  The backward pass then copies
    the next element into every remaining default-valued slot whose successor
    index is not a multiple of 128.  Values therefore never cross a 128-entry
    boundary.
    """
    size = len(values)
    for pos in range(size):
        if values[pos] == default and pos % 128:
            values[pos] = values[pos - 1]
    for pos in reversed(range(size - 1)):
        if values[pos] == default and (pos + 1) % 128:
            values[pos] = values[pos + 1]


for compression in (DEFAULT, COMPACT, SLOPPY):
    logging.info(' Compression=%d:' % compression)
    print()
    print(section_guard.get(compression, '#else'))
    print()

    if compression == SLOPPY:
        # `datasets` holds these same list objects, so the in-place edits are
        # what the packing below sees.
        _smear_default(gc, 'Cn')
        _smear_default(sc, 'Zzzz')

    # Each variant gets its own Code object and is printed on its own.
    code = packTab.Code('_hb_ucd')
    for name, data, default, mapping in datasets:
        solution = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info(' Dataset=%-8s FullCost=%d' % (name, solution.fullCost))
        solution.genCode(code, name)
    code.print_c(linkage='static inline')

    print()

# Close the #ifndef / #elif / #else chain opened inside the loop.
print('#endif')
print()

print()
print("#endif /* HB_UCD_TABLE_HH */")
print()
print("/* == End of generated table == */")
logging.info('Done.')