# Copyright (c) 2014, Thomas Goyne <plorkyeran@aegisub.org>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Aegisub Project http://www.aegisub.org/

# A script to strip all of the data we don't use out of ICU's data files
# Run from $ICU_ROOT/source/data

from __future__ import print_function, unicode_literals

import io
import os
import re


# Remove stuff we don't use at all from the Makefile
def delete_matching(filename, strs):
    """Rewrite ``filename`` in place, dropping every line matched by ``strs``.

    ``strs`` is an iterable of regular expression strings. Each one is applied
    with ``re.match``, so it is anchored at the start of the line.
    """
    exprs = [re.compile(s) for s in strs]

    # The file is handled as bytes so the lines we keep are written back
    # untouched; a line is only decoded to test it against the patterns.
    with io.open(filename, 'rb') as f:
        lines = [line for line in f
                 if not any(r.match(line.decode('utf-8')) for r in exprs)]

    with io.open(filename, 'wb') as f:
        f.writelines(lines)


REMOVE_SUBDIRS = ['LOCSRCDIR', 'CURRSRCDIR', 'ZONESRCDIR', 'COLSRCDIR', 'RBNFSRCDIR', 'TRANSLITSRCDIR']
delete_matching('Makefile.in', ['^-include .*%s' % s for s in REMOVE_SUBDIRS])
delete_matching('Makefile.in', ['^CNV_FILES'])

with io.open('misc/misclocal.mk', 'w', encoding='utf-8') as f:
    f.write('MISC_SOURCE = supplementalData.txt likelySubtags.txt icuver.txt icustd.txt metadata.txt')

# Matches a `key{"value"}` leaf line; group 1 is the key, group 2 the value.
_VALUE_RE = re.compile(r'(.*)\{"(.*)"\}')


# Remove data we don't need from the lang and region files
def parse_txt(filename):
    """Parse the UTF-8 text file ``filename`` into nested dicts.

    A line ending in ``{`` opens a child dict keyed by the text before the
    brace, a line consisting of ``}`` closes the current dict, and a
    ``key{"value"}`` line stores a string. Blank lines, any line containing
    ``//`` and ``/* ... */`` comments are skipped. Lines that fit none of
    these shapes are printed and otherwise ignored.

    Returns the root dict.
    """
    root = {}
    cur = root
    stack = [root]
    comment = False
    with io.open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # The in-comment test has to come before the '//' and '/*' tests,
            # otherwise a closing line that also contains one of those
            # markers would never end the comment.
            if comment:
                if '*/' in line:
                    comment = False
                continue
            if '//' in line:
                continue
            if '/*' in line:
                # Only stay in comment mode if the comment is not closed
                # again on this same line.
                comment = '*/' not in line.split('/*', 1)[1]
                continue

            if line == '}':
                stack.pop()
                cur = stack[-1]
                continue
            if line.endswith('{'):
                obj = {}
                cur[line[:-1]] = obj
                cur = obj
                stack.append(obj)
                continue

            m = _VALUE_RE.match(line)
            if not m:
                print(line)
            else:
                cur[m.group(1)] = m.group(2)

    return root


def remove_sections(root):
    """Drop the sections we never use from every top-level entry of ``root``."""
    for child in root.values():
        child.pop('Keys', None)
        child.pop('LanguagesShort', None)
        child.pop('Types', None)
        child.pop('Variants', None)
        child.pop('codePatterns', None)
        child.pop('localeDisplayPattern', None)
        child.pop('CountriesShort', None)
        child.pop('Scripts%stand-alone', None)


def remove_languages(root):
    """Reduce each entry's 'Languages' table to the entry's own language."""
    for lang, child in root.items():
        # We only care about a language's name in that language
        lang = lang.split('_')[0]
        trimmed = {}
        v = child.get('Languages', {}).get(lang)
        if v:
            trimmed[lang] = v
        child['Languages'] = trimmed


# Scripts which are actually used by stuff
SCRIPTS = ['Cyrl', 'Latn', 'Arab', 'Vaii', 'Hans', 'Hant']


def remove_scripts(root):
    """Reduce each entry's 'Scripts' table to the scripts listed in SCRIPTS."""
    for lang, child in root.items():
        v = child.get('Scripts')
        if not v:
            continue

        trimmed = {}
        for script in SCRIPTS:
            if v.get(script):
                trimmed[script] = v[script]
        child['Scripts'] = trimmed


def write_dict(name, value, out, indent):
    """Write the dict ``value`` to ``out`` as a ``name{ ... }`` block.

    ``out`` is a text stream and ``indent`` is the prefix for the opening and
    closing lines; children are indented one step further. Keys are written
    in sorted order, nested dicts recursively, and anything else as
    ``key{"value"}``. Empty dicts produce no output at all.
    """
    if not value:
        return

    child_indent = indent + ' '

    out.write(indent)
    out.write(name)
    out.write('{\n')
    for k in sorted(value):
        v = value[k]
        if isinstance(v, dict):
            write_dict(k, v, out, child_indent)
        else:
            out.write('%s%s{"%s"}\n' % (child_indent, k, v))
    out.write(indent)
    out.write('}\n')


def write_file(root, filename):
    """Write every top-level entry of ``root`` to ``filename`` as UTF-8."""
    with io.open(filename, 'w', encoding='utf-8') as f:
        for k, v in root.items():
            write_dict(k, v, f, '')


def minify_lang(filename):
    """Strip the unused data from the lang file ``filename``, in place."""
    f = parse_txt(filename)
    remove_sections(f)
    remove_languages(f)
    remove_scripts(f)
    write_file(f, filename)


for name in os.listdir('lang'):
    if not name.endswith('.txt'):
        continue
    minify_lang('lang/' + name)


# gather information about which language+region combinations actually exist,
# so that we can drop all others
def gather_regions():
    """Return a dict mapping each language code to a list of region codes.

    The built-in table is extended with the ``_``-separated suffixes of every
    ``<lang>_<...>.txt`` file name found in the ``region`` directory.
    Duplicates in the lists are harmless because they are only used for
    lookups.
    """
    langs = {
        'af': ['ZA'],
        'am': ['ET'],
        'ar': ['AE', 'BH', 'DZ', 'EG', 'IQ', 'JO', 'KW', 'LB', 'LY', 'MA', 'OM', 'QA', 'SA', 'SY', 'TN', 'YE'],
        'arn': ['CL'],
        'as': ['IN'],
        'az': ['AZ', 'AZ'],
        'ba': ['RU'],
        'be': ['BY'],
        'bg': ['BG'],
        'bn': ['BD', 'IN'],
        'bo': ['CN'],
        'br': ['FR'],
        'bs': ['BA', 'BA'],
        'ca': ['ES'],
        'co': ['FR'],
        'cs': ['CZ'],
        'cy': ['GB'],
        'da': ['DK'],
        'de': ['AT', 'CH', 'DE', 'LI', 'LU'],
        'div': ['MV'],
        'el': ['GR'],
        'en': ['029', 'AU', 'BZ', 'CA', 'GB', 'IE', 'IN', 'JM', 'MY', 'NZ', 'PH', 'SG', 'TT', 'US', 'ZA', 'ZW'],
        'es': ['AR', 'BO', 'CL', 'CO', 'CR', 'DO', 'EC', 'ES', 'GT', 'HN', 'MX', 'NI', 'PA', 'PE', 'PR', 'PY', 'SV', 'US', 'UY', 'VE'],
        'et': ['EE'],
        'eu': ['ES'],
        'fa': ['IR'],
        'fi': ['FI'],
        'fil': ['PH'],
        'fo': ['FO'],
        'fr': ['BE', 'CA', 'CH', 'FR', 'LU', 'MC'],
        'fy': ['NL'],
        'ga': ['IE'],
        'gl': ['ES'],
        'gsw': ['FR'],
        'gu': ['IN'],
        'ha': ['NG'],
        'he': ['IL'],
        'hi': ['IN'],
        'hr': ['BA', 'HR'],
        'hu': ['HU'],
        'hy': ['AM'],
        'id': ['ID'],
        'ig': ['NG'],
        'ii': ['CN'],
        'is': ['IS'],
        'it': ['CH', 'IT'],
        'iu': ['CA', 'CA'],
        'ja': ['JP'],
        'ka': ['GE'],
        'kk': ['KZ'],
        'kl': ['GL'],
        'km': ['KH'],
        'kn': ['IN'],
        'ko': ['KR'],
        'kok': ['IN'],
        'ky': ['KG'],
        'lb': ['LU'],
        'lo': ['LA'],
        'lt': ['LT'],
        'lv': ['LV'],
        'mi': ['NZ'],
        'mk': ['MK'],
        'ml': ['IN'],
        'mn': ['CN', 'MN'],
        'moh': ['CA'],
        'mr': ['IN'],
        'ms': ['BN', 'MY'],
        'mt': ['MT'],
        'nb': ['NO'],
        'ne': ['NP'],
        'nl': ['BE', 'NL'],
        'nn': ['NO'],
        'nso': ['ZA'],
        'oc': ['FR'],
        'or': ['IN'],
        'pa': ['IN'],
        'pl': ['PL'],
        'prs': ['AF'],
        'ps': ['AF'],
        'pt': ['BR', 'PT'],
        'qut': ['GT'],
        'quz': ['BO', 'EC', 'PE'],
        'rm': ['CH'],
        'ro': ['RO'],
        'ru': ['RU'],
        'rw': ['RW'],
        'sa': ['IN'],
        'sah': ['RU'],
        'se': ['FI', 'NO', 'SE'],
        'si': ['LK'],
        'sk': ['SK'],
        'sl': ['SI'],
        'sma': ['NO', 'SE'],
        'smj': ['NO', 'SE'],
        'smn': ['FI'],
        'sms': ['FI'],
        'sq': ['AL'],
        'sr': ['BA', 'BA', 'SP', 'YU'],
        'sv': ['FI', 'SE'],
        'sw': ['KE', 'TZ'],
        'syr': ['SY'],
        'ta': ['IN'],
        'te': ['IN'],
        'tg': ['TJ'],
        'th': ['TH'],
        'tk': ['TM'],
        'tn': ['ZA'],
        'tr': ['TR'],
        'tt': ['RU'],
        'tzm': ['DZ'],
        'ug': ['CN'],
        'uk': ['UA'],
        'ur': ['PK'],
        'uz': ['UZ', 'UZ'],
        'vi': ['VN'],
        'wee': ['DE'],
        'wen': ['DE'],
        'wo': ['SN'],
        'xh': ['ZA'],
        'yo': ['NG'],
        'zh': ['CN', 'HK', 'MO', 'SG', 'TW'],
        'zu': ['ZA'],
    }
    for name in os.listdir('region'):
        if not name.endswith('.txt'):
            continue
        parts = name[:-4].split('_')
        if len(parts) == 1:
            continue
        langs.setdefault(parts[0], []).extend(parts[1:])
    return langs


REGIONS = gather_regions()


def remove_countries(root):
    """Reduce each entry's 'Countries' table to the regions in REGIONS.

    The table is removed entirely when REGIONS has no regions for the
    entry's language.
    """
    for lang, child in root.items():
        v = child.get('Countries', {})
        if not v:
            continue

        # We only care about the names for regions in the languages used in
        # those regions
        lang = lang.split('_')[0]
        regions = REGIONS.get(lang)
        if not regions:
            del child['Countries']
            continue

        trimmed = {}
        for region in regions:
            name = v.get(region)
            if name:
                trimmed[region] = name
        child['Countries'] = trimmed


def minify_region(filename):
    """Strip the unused data from the region file ``filename``, in place."""
    f = parse_txt(filename)
    remove_sections(f)
    remove_countries(f)
    write_file(f, filename)


for name in os.listdir('region'):
    if not name.endswith('.txt'):
        continue
    minify_region('region/' + name)