# Copyright (c) 2014, Thomas Goyne <plorkyeran@aegisub.org>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Aegisub Project http://www.aegisub.org/

# A script to strip all of the data we don't use out of ICU's data files
# Run from $ICU_ROOT/source/data

from __future__ import unicode_literals
import re
import os

# Remove stuff we don't use at all from the Makefile
def delete_matching(filename, strs):
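    """Rewrite filename in place, dropping every line that matches any of
    the regular expressions in strs."""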
    exprs = [re.compile(s) for s in strs]

    with open(filename) as f:
        lines = [line for line in f if not any(r.match(line.decode('utf-8')) for r in exprs)]

    with open(filename, 'w') as f:
        for line in lines:
            f.write(line)

REMOVE_SUBDIRS=['LOCSRCDIR', 'CURRSRCDIR', 'ZONESRCDIR', 'COLSRCDIR', 'RBNFSRCDIR', 'TRANSLITSRCDIR']
delete_matching('Makefile.in', ['^-include .*%s' % s for s in REMOVE_SUBDIRS])
delete_matching('Makefile.in', ['^CNV_FILES'])

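# Restrict the misc data to just the files we still need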
with open('misc/misclocal.mk', 'w') as f:
    f.write('MISC_SOURCE = supplementalData.txt likelySubtags.txt icuver.txt icustd.txt metadata.txt')

# Remove data we don't need from the lang and region files
def parse_txt(filename):
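    """Parse an ICU .txt data file into nested dicts.

    Nested tables of the form name{ ... } become dicts and leaf entries of
    the form name{"value"} become strings; // and /* */ comments are
    skipped. As an illustration, a fragment along the lines of

        de{
            Languages{
                de{"Deutsch"}
            }
        }

    parses to {'de': {'Languages': {'de': 'Deutsch'}}}.
    """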
    root = {}
    cur = root
    stack = [root]
    comment = False
    for line in open(filename):
        line = line.decode('utf-8')
        line = line.strip()
        if len(line) == 0:
            continue
        if '//' in line:
            continue
        if '/*' in line:
            comment = True
            continue
        if comment:
            if '*/' in line:
                comment = False
            continue

        if line == '}':
            stack.pop()
            cur = stack[-1]
            continue
        if line.endswith('{'):
            obj = {}
            cur[line[:-1]] = obj
            cur = obj
            stack.append(obj)
            continue

        m = re.match('(.*){"(.*)"}', line)
        if not m:
            print line
        else:
            cur[m.group(1)] = m.group(2)

    return root

def remove_sections(root):
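    """Drop the sub-tables we don't use from each locale's top-level table."""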
    for child in root.itervalues():
        child.pop('Keys', None)
        child.pop('LanguagesShort', None)
        child.pop('Types', None)
        child.pop('Variants', None)
        child.pop('codePatterns', None)
        child.pop('localeDisplayPattern', None)
        child.pop('CountriesShort', None)
        child.pop('Scripts%stand-alone', None)

def remove_languages(root):
    for lang, child in root.iteritems():
        # We only care about a language's name in that language
        lang = lang.split('_')[0]
        trimmed = {}
        v = child.get('Languages', {}).get(lang)
        if v:
            trimmed[lang] = v
        child['Languages'] = trimmed

# The only scripts whose names we actually need
SCRIPTS = ['Cyrl', 'Latn', 'Arab', 'Vaii', 'Hans', 'Hant']
def remove_scripts(root):
    for lang, child in root.iteritems():
        v = child.get('Scripts')
        if not v:
            continue

        trimmed = {}
        for script in SCRIPTS:
            if v.get(script):
                trimmed[script] = v[script]
        child['Scripts'] = trimmed

def write_dict(name, value, out, indent):
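    """Recursively write a dict back out in ICU's text format; empty dicts
    are skipped entirely. For example, write_dict('Languages',
    {'de': 'Deutsch'}, f, '') produces

        Languages{
            de{"Deutsch"}
        }
    """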
    if len(value) == 0:
        return

    child_indent = indent + '    '

    out.write(indent)
    out.write(name.encode('utf-8'))
    out.write('{\n')
    for k in sorted(value.keys()):
        v = value[k]
        if type(v) == dict:
            write_dict(k, v, out, child_indent)
        else:
            out.write(('%s%s{"%s"}\n' % (child_indent, k, v)).encode('utf-8'))
    out.write(indent)
    out.write('}\n')

def write_file(root, filename):
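    """Serialize a parsed tree back out to filename in ICU's text format."""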
    with open(filename, 'w') as f:
        for k, v in root.iteritems():
            write_dict(k, v, f, '')

def minify_lang(filename):
    f = parse_txt(filename)
    remove_sections(f)
    remove_languages(f)
    remove_scripts(f)
    write_file(f, filename)

for name in os.listdir('lang'):
    if not name.endswith('.txt'):
        continue
    minify_lang('lang/' + name)

# gather information about which language+region combinations actually exist,
# so that we can drop all others
def gather_regions():
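    """Build a map from language code to the region codes whose names we
    want to keep for that language, starting from a hard-coded table of the
    regions each language is used in."""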
    langs = {
        'af': ['ZA'],
        'am': ['ET'],
        'ar': ['AE', 'BH', 'DZ', 'EG', 'IQ', 'JO', 'KW', 'LB', 'LY', 'MA', 'OM', 'QA', 'SA', 'SY', 'TN', 'YE'],
        'arn': ['CL'],
        'as': ['IN'],
        'az': ['AZ', 'AZ'],
        'ba': ['RU'],
        'be': ['BY'],
        'bg': ['BG'],
        'bn': ['BD', 'IN'],
        'bo': ['CN'],
        'br': ['FR'],
        'bs': ['BA', 'BA'],
        'ca': ['ES'],
        'co': ['FR'],
        'cs': ['CZ'],
        'cy': ['GB'],
        'da': ['DK'],
        'de': ['AT', 'CH', 'DE', 'LI', 'LU'],
        'div': ['MV'],
        'el': ['GR'],
        'en': ['029', 'AU', 'BZ', 'CA', 'GB', 'IE', 'IN', 'JM', 'MY', 'NZ', 'PH', 'SG', 'TT', 'US', 'ZA', 'ZW'],
        'es': ['AR', 'BO', 'CL', 'CO', 'CR', 'DO', 'EC', 'ES', 'GT', 'HN', 'MX', 'NI', 'PA', 'PE', 'PR', 'PY', 'SV', 'US', 'UY', 'VE'],
        'et': ['EE'],
        'eu': ['ES'],
        'fa': ['IR'],
        'fi': ['FI'],
        'fil': ['PH'],
        'fo': ['FO'],
        'fr': ['BE', 'CA', 'CH', 'FR', 'LU', 'MC'],
        'fy': ['NL'],
        'ga': ['IE'],
        'gl': ['ES'],
        'gsw': ['FR'],
        'gu': ['IN'],
        'ha': ['NG'],
        'he': ['IL'],
        'hi': ['IN'],
        'hr': ['BA', 'HR'],
        'hu': ['HU'],
        'hy': ['AM'],
        'id': ['ID'],
        'ig': ['NG'],
        'ii': ['CN'],
        'is': ['IS'],
        'it': ['CH', 'IT'],
        'iu': ['CA', 'CA'],
        'ja': ['JP'],
        'ka': ['GE'],
        'kk': ['KZ'],
        'kl': ['GL'],
        'km': ['KH'],
        'kn': ['IN'],
        'ko': ['KR'],
        'kok': ['IN'],
        'ky': ['KG'],
        'lb': ['LU'],
        'lo': ['LA'],
        'lt': ['LT'],
        'lv': ['LV'],
        'mi': ['NZ'],
        'mk': ['MK'],
        'ml': ['IN'],
        'mn': ['CN', 'MN'],
        'moh': ['CA'],
        'mr': ['IN'],
        'ms': ['BN', 'MY'],
        'mt': ['MT'],
        'nb': ['NO'],
        'ne': ['NP'],
        'nl': ['BE', 'NL'],
        'nn': ['NO'],
        'nso': ['ZA'],
        'oc': ['FR'],
        'or': ['IN'],
        'pa': ['IN'],
        'pl': ['PL'],
        'prs': ['AF'],
        'ps': ['AF'],
        'pt': ['BR', 'PT'],
        'qut': ['GT'],
        'quz': ['BO', 'EC', 'PE'],
        'rm': ['CH'],
        'ro': ['RO'],
        'ru': ['RU'],
        'rw': ['RW'],
        'sa': ['IN'],
        'sah': ['RU'],
        'se': ['FI', 'NO', 'SE'],
        'si': ['LK'],
        'sk': ['SK'],
        'sl': ['SI'],
        'sma': ['NO', 'SE'],
        'smj': ['NO', 'SE'],
        'smn': ['FI'],
        'sms': ['FI'],
        'sq': ['AL'],
        'sr': ['BA', 'BA', 'SP', 'YU'],
        'sv': ['FI', 'SE'],
        'sw': ['KE', 'TZ'],
        'syr': ['SY'],
        'ta': ['IN'],
        'te': ['IN'],
        'tg': ['TJ'],
        'th': ['TH'],
        'tk': ['TM'],
        'tn': ['ZA'],
        'tr': ['TR'],
        'tt': ['RU'],
        'tzm': ['DZ'],
        'ug': ['CN'],
        'uk': ['UA'],
        'ur': ['PK'],
        'uz': ['UZ', 'UZ'],
        'vi': ['VN'],
        'wee': ['DE'],
        'wen': ['DE'],
        'wo': ['SN'],
        'xh': ['ZA'],
        'yo': ['NG'],
        'zh': ['CN', 'HK', 'MO', 'SG', 'TW'],
        'zu': ['ZA']
    }
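    # Also keep every language_REGION combination that has its own data
    # file in region/ (a hypothetical en_GB.txt would add GB to the 'en' entry)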
    for name in os.listdir('region'):
        if not name.endswith('.txt'): continue
        parts = name[:-4].split('_')
        if len(parts) == 1: continue
        if not parts[0] in langs:
            langs[parts[0]] = []
        langs[parts[0]].extend(parts[1:])
    return langs

REGIONS = gather_regions()
def remove_countries(root):
    for lang, child in root.iteritems():
        v = child.get('Countries', {})
        if not v: continue

        # We only care about a region's name in the languages that are
        # used in that region
        lang = lang.split('_')[0]
        regions = REGIONS.get(lang)
        if not regions:
            del child['Countries']
            continue

        trimmed = {}
        for region in regions:
            name = v.get(region)
            if name:
                trimmed[region] = name
        child['Countries'] = trimmed

def minify_region(filename):
    f = parse_txt(filename)
    remove_sections(f)
    remove_countries(f)
    write_file(f, filename)

for name in os.listdir('region'):
    if not name.endswith('.txt'):
        continue
    minify_region('region/' + name)