# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import *
from . import utils
from .request_types import *


# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
    @staticmethod
    def create_from_json(json_data, io):
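        """Constructs a Filter from a JSON filter specification.

        Returns an instance of the Filter subclass named by "filterType"
        (defaulting to "file-stem"), or None after printing an error if the
        filterType is not recognized. An illustrative spec (example values,
        not defaults):

            {"filterType": "language", "includelist": ["af", "bs"]}
        """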
        assert io is not None
        if "filterType" in json_data:
            filter_type = json_data["filterType"]
        else:
            filter_type = "file-stem"

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data, io)
        elif filter_type == "locale":
            return LocaleFilter(json_data, io)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
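        """Applies this file filter to a request. Returns [] if the request
        is filtered out entirely, or [request] otherwise; every surviving
        input file is asserted to match this filter."""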
        if not request.apply_file_filter(self):
            return []
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @staticmethod
    def _file_to_file_stem(file):
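        """Returns the filename without its directory or extension; for
        example (illustrative), "locales/de.txt" yields "de"."""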
        start = file.filename.rfind("/")
        limit = file.filename.rfind(".")
        return file.filename[start+1:limit]

    @staticmethod
    def _file_to_subdir(file):
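        """Returns the directory portion of the filename, or None if the
        filename has no directory separator."""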
        limit = file.filename.rfind("/")
        if limit == -1:
            return None
        return file.filename[:limit]

    @abstractmethod
    def match(self, file):
        pass


class InclusionFilter(Filter):
    def match(self, file):
        return True


class ExclusionFilter(Filter):
    def match(self, file):
        return False


class IncludeExcludeFilter(Filter):
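    """Shared base for filters configured with either an includelist
    ("whitelist" is also accepted) or an excludelist ("blacklist" is also
    accepted) of strings."""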
    def __init__(self, json_data):
        if "whitelist" in json_data:
            self.is_includelist = True
            self.includelist = json_data["whitelist"]
        elif "includelist" in json_data:
            self.is_includelist = True
            self.includelist = json_data["includelist"]
        elif "blacklist" in json_data:
            self.is_includelist = False
            self.excludelist = json_data["blacklist"]
        elif "excludelist" in json_data:
            self.is_includelist = False
            self.excludelist = json_data["excludelist"]
        else:
            raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))

    def match(self, file):
        file_stem = self._file_to_file_stem(file)
        return self._should_include(file_stem)

    @abstractmethod
    def _should_include(self, file_stem):
        pass


class FileStemFilter(IncludeExcludeFilter):
    def _should_include(self, file_stem):
        if self.is_includelist:
            return file_stem in self.includelist
        else:
            return file_stem not in self.excludelist


class LanguageFilter(IncludeExcludeFilter):
    def _should_include(self, file_stem):
        language = file_stem.split("_")[0]
        if language == "root":
            # Always include root.txt
            return True
        if self.is_includelist:
            return language in self.includelist
        else:
            return language not in self.excludelist


class RegexFilter(IncludeExcludeFilter):
    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        if self.is_includelist:
            self.includelist = [re.compile(pat) for pat in self.includelist]
        else:
            self.excludelist = [re.compile(pat) for pat in self.excludelist]

    def _should_include(self, file_stem):
        if self.is_includelist:
            for pattern in self.includelist:
                if pattern.match(file_stem):
                    return True
            return False
        else:
            for pattern in self.excludelist:
                if pattern.match(file_stem):
                    return False
            return True


class UnionFilter(Filter):
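    """Filter that matches a file iff at least one of the sub-filters listed
    under "unionOf" matches it."""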
    def __init__(self, json_data, io):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json, io))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        for filter in self.sub_filters:
            if filter.match(file):
                return True
        return False


LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
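    """Filter that matches locale data files against a list of requested
    locales, walking parent and alias relationships from the per-tree
    dependency data. An illustrative spec (example values):

        {"filterType": "locale", "includelist": ["de", "sr_Latn"],
         "includeChildren": true, "includeScripts": true}
    """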
    def __init__(self, json_data, io):
        if "whitelist" in json_data:
            self.locales_requested = list(json_data["whitelist"])
        elif "includelist" in json_data:
            self.locales_requested = list(json_data["includelist"])
        else:
            raise AssertionError("You must have an includelist in a locale filter")
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        # No matches.
        return False

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        i = locale.rfind("_")
        if i < 0:
            assert locale == "root", "Invalid locale: %s/%s" % (tree, locale)
            return None
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)


def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config, io)
    requests = _apply_resource_filters(requests, config, io)
    return requests


def _apply_file_filters(old_requests, config, io):
    """Filters out entire files."""
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests


def _preprocess_file_filters(requests, config, io):
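    """Computes the per-category file filters from the config. Returns a map
    from category to Filter; categories resolving to "include" are left out
    of the map (no filtering needed)."""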
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    json_data = config.filters_json_data
    filters = {}
    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    for category in all_categories:
        filter_json = default_filter_json
        # Special default for category "brkitr_lstm" as "exclude" for now.
        if "brkitr_lstm" == category:
            filter_json = "exclude"
        # Figure out the correct filter to create.
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filter_json = json_data["featureFilters"][category]
        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
            filter_json = json_data["localeFilter"]
        # Resolve the filter JSON into a filter object
        if filter_json == "exclude":
            filters[category] = ExclusionFilter()
        elif filter_json == "include":
            pass  # no-op
        else:
            filters[category] = Filter.create_from_json(filter_json, io)
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters


class ResourceFilterInfo(object):
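    """Tracks the resource filter rules for one category and produces the
    filter rule files that are passed to genrb via --filterDir."""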
    def __init__(self, category, strategy):
        self.category = category
        self.strategy = strategy
        self.filter_tmp_dir = "filters/%s" % category
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
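        """Locates the genrb request(s) for this category and attaches the
        --filterDir argument and dependencies on the generated filter files."""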
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/")+1:]
                for file in files
            )
        ]
        if self.strategy == "additive":
            self.rules_by_file = [
                [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
                for _ in range(len(files))
            ]
        else:
            self.rules_by_file = [
                [r"+/"]
                for _ in range(len(files))
            ]

    def add_rules(self, file_filter, rules):
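        """Appends the given rules to every input file matched by file_filter."""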
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list += rules

    def make_requests(self):
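        """Returns a PrintFileRequest for each unique rule list, plus
        CopyRequests for the remaining filter files that share that list."""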
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @staticmethod
    def _generate_resource_filter_txt(rules):
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result


def _apply_resource_filters(all_requests, config, io):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"], io)
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category, config.strategy)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests