1import argparse
2import fnmatch
3import os.path
4import re
5import sys
6import unicodedata
7from collections import namedtuple
8from enum import Enum
9
10from . import shellpattern
11from .helpers import clean_lines
12
13
def parse_patternfile_line(line, roots, ie_commands, fallback):
    """Parse one pattern-file line and apply the command it encodes.

    A root-path command appends its value to *roots*, an include/exclude
    command is appended to *ie_commands*, and a pattern-style command only
    changes the default pattern class.

    Returns the pattern class to use as *fallback* for the next line: the
    newly selected one after a pattern-style command, otherwise the
    *fallback* that was passed in.
    """
    parsed = parse_inclexcl_command(line, fallback=fallback)
    kind = parsed.cmd

    if kind is IECommand.PatternStyle:
        # nothing to record, the line merely switches the default style
        return parsed.val

    if kind is IECommand.RootPath:
        roots.append(parsed.val)
    else:
        # every remaining command is some kind of include/exclude rule
        ie_commands.append(parsed)
    return fallback
25
26
def load_pattern_file(fileobj, roots, ie_commands, fallback=None):
    """Feed every line that clean_lines() yields for *fileobj* to parse_patternfile_line().

    Root paths end up in *roots* and include/exclude commands in
    *ie_commands*.  The default pattern style starts out as *fallback*
    (ShellPattern when None is given) and follows any style switches found
    in the file.
    """
    # ShellPattern is defined later in this module, hence no default argument
    style = ShellPattern if fallback is None else fallback
    for entry in clean_lines(fileobj):
        style = parse_patternfile_line(entry, roots, ie_commands, style)
32
33
def load_exclude_file(fileobj, patterns):
    """Append one exclude command to *patterns* per line that clean_lines() yields for *fileobj*."""
    for raw in clean_lines(fileobj):
        exclude_cmd = parse_exclude_pattern(raw)
        patterns.append(exclude_cmd)
37
38
class ArgparsePatternAction(argparse.Action):
    """argparse action handling one pattern-file style line given as an option value.

    The line is always parsed with ShellPattern as default style.  The style
    returned by parse_patternfile_line() is discarded, so a style switch
    given through this action does not carry over to later options.
    """

    def __init__(self, nargs=1, **kw):
        super().__init__(nargs=nargs, **kw)

    def __call__(self, parser, args, values, option_string=None):
        line = values[0]
        parse_patternfile_line(line, args.paths, args.patterns, ShellPattern)
45
46
class ArgparsePatternFileAction(argparse.Action):
    """argparse action that opens the file named by the option value and parses it.

    Subclasses can override parse() to change how the opened file is
    interpreted.
    """

    def __init__(self, nargs=1, **kw):
        super().__init__(nargs=nargs, **kw)

    def __call__(self, parser, args, values, option_string=None):
        """Open the named file and hand it to parse().

        Lines that are empty or start with '#' after stripping whitespace on
        both ends are ignored (presumably by clean_lines(), which the loaders
        iterate over).
        """
        with open(values[0]) as pattern_file:
            self.parse(pattern_file, args)

    def parse(self, fobj, args):
        """Load root paths into args.paths and include/exclude commands into args.patterns."""
        roots, commands = args.paths, args.patterns
        load_pattern_file(fobj, roots, commands)
61
62
class ArgparseExcludeFileAction(ArgparsePatternFileAction):
    """Variant of the pattern-file action that reads the file as a list of exclude patterns."""

    def parse(self, fobj, args):
        """Append one exclude command per cleaned line to args.patterns."""
        exclude_sink = args.patterns
        load_exclude_file(fobj, exclude_sink)
66
67
class PatternMatcher:
    """Collection of pattern objects that paths can be matched against.

    *fallback* is the value *match()* returns when none of the stored
    patterns applies to a path.

    """
    def __init__(self, fallback=None):
        # (pattern, cmd) pairs; match() walks them in insertion order
        self._items = []

        # Handed back by match() when no pattern applies.
        self.fallback = fallback

        # optimization: full, normalized path -> cmd, consulted before _items
        self._path_full_patterns = {}

        # Set to True or False by every match() call; tells whether the
        # pattern the call ended on wants matching folders to be recursed into.
        self.recurse_dir = None

        # value recurse_dir takes when match() finds no pattern
        # TODO: allow modification as a config option?
        self.recurse_dir_default = True

        # patterns registered through add_includepaths()
        self.include_patterns = []

        # TODO: move this info to parse_inclexcl_command and store in PatternBase subclass?
        self.is_include_cmd = {
            IECommand.Exclude: False,
            IECommand.ExcludeNoRecurse: False,
            IECommand.Include: True
        }

    def empty(self):
        """Return True when no pattern of any kind has been added."""
        return not (self._items or self._path_full_patterns)

    def _add(self, pattern, cmd):
        """Store a single *pattern*; *cmd* is an IECommand value.

        Full-path patterns go into the lookup dict (a later entry for the
        same path replaces the earlier one), everything else is appended to
        the ordered list.
        """
        if not isinstance(pattern, PathFullPattern):
            self._items.append((pattern, cmd))
            return
        # pattern.pattern is the full, normalized path
        self._path_full_patterns[pattern.pattern] = cmd

    def add(self, patterns, cmd):
        """Register every pattern in *patterns* under the same *cmd*.

        *cmd* tells whether the patterns include or exclude, and whether
        excluded folders should still be recursed into.
        """
        for entry in patterns:
            self._add(entry, cmd)

    def add_includepaths(self, include_paths):
        """Register inclusion paths taken from args.paths (from commandline).

        Each path is parsed with PathPrefixPattern as default style.  The
        fallback becomes False when at least one path was given and True
        when there were none.
        """
        as_patterns = [parse_pattern(path, PathPrefixPattern) for path in include_paths]
        self.add(as_patterns, IECommand.Include)
        self.fallback = not as_patterns
        self.include_patterns = as_patterns

    def get_unmatched_include_patterns(self):
        "Return never-matched patterns; only those added via *add_includepaths* are considered."
        unmatched = []
        for candidate in self.include_patterns:
            if candidate.match_count == 0:
                unmatched.append(candidate)
        return unmatched

    def add_inclexcl(self, patterns):
        """Register (pattern, cmd) pairs (of type CmdTuple), each under its own command.
        """
        for item in patterns:
            pattern, cmd = item
            self._add(pattern, cmd)

    def match(self, path):
        """Return True or False depending on whether *path* is included.

        Full-path patterns are looked up first, then the remaining patterns
        are tried in the order they were added; the first hit decides.  When
        nothing matches, self.fallback is returned (defaults to None).

        As a side effect self.recurse_dir is updated on every call.

        """
        path = normalize_path(path)

        # fast lookup for full path matches (note: such matches are not counted)
        full_path_cmds = self._path_full_patterns
        if path in full_path_cmds:
            cmd = full_path_cmds[path]
            self.recurse_dir = command_recurses_dir(cmd)
            return self.is_include_cmd[cmd]

        # slow path: linear scan, costly when self._items holds many patterns
        for pattern, cmd in self._items:
            if not pattern.match(path, normalize=False):
                continue
            self.recurse_dir = pattern.recurse_dir
            return self.is_include_cmd[cmd]

        # nothing matched, fall back to the default recursion behaviour
        self.recurse_dir = self.recurse_dir_default
        return self.fallback
165
166
def normalize_path(path):
    """Return *path* in NFD form on MacOS and unchanged on every other platform."""
    # Windows and Unix filesystems allow different forms, so users always have to enter an exact match.
    if sys.platform != 'darwin':
        return path
    # HFS+ converts paths to a canonical form, so users shouldn't be required to enter an exact match.
    return unicodedata.normalize('NFD', path)
172
173
class PatternBase:
    """Common behaviour of all inclusion/exclusion pattern classes.

    Subclasses supply PREFIX (their style identifier), _prepare() and
    _match(); this class keeps the original pattern text, counts successful
    matches and takes care of path normalization.
    """
    PREFIX = NotImplemented

    def __init__(self, pattern, recurse_dir=False):
        self.pattern_orig = pattern
        self.match_count = 0
        # _prepare() receives the normalized text and has to set self.pattern
        self._prepare(normalize_path(pattern))
        self.recurse_dir = recurse_dir

    def match(self, path, normalize=True):
        """Tell whether *path* is matched by this pattern and count the hit.

        With normalize=True (default) the path is run through normalize_path()
        first; pass False when the caller has already done that.
        """
        candidate = normalize_path(path) if normalize else path
        outcome = self._match(candidate)
        if outcome:
            self.match_count += 1
        return outcome

    def __repr__(self):
        cls = type(self)
        return '%s(%s)' % (cls, self.pattern)

    def __str__(self):
        # the text exactly as it was given, before any normalization
        return self.pattern_orig

    def _prepare(self, pattern):
        "Hook for subclasses; has to set the value of self.pattern"
        raise NotImplementedError

    def _match(self, path):
        "Hook for subclasses; decides whether the normalized *path* matches"
        raise NotImplementedError
211
212
class PathFullPattern(PatternBase):
    """Matches exactly one path: the pattern itself after os.path.normpath()."""
    PREFIX = "pf"

    def _prepare(self, pattern):
        normalized = os.path.normpath(pattern)
        self.pattern = normalized

    def _match(self, path):
        expected = self.pattern
        return path == expected
222
223
224# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
225# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
226# separator to the end of the path before matching.
227
228
class PathPrefixPattern(PatternBase):
    """Literal files or directories listed on the command line
    for some operations (e.g. extract, but not create).

    A path matches when it equals the pattern or lies below it, so naming a
    directory covers everything inside it.  A trailing slash on the pattern
    makes no difference.
    """
    PREFIX = "pp"

    def _prepare(self, pattern):
        sep = os.path.sep
        # always end in exactly one separator, trailing ones of the input are dropped first
        self.pattern = os.path.normpath(pattern).rstrip(sep) + sep

    def _match(self, path):
        # the appended separator lets the path itself match, not just its children
        terminated = path + os.path.sep
        return terminated.startswith(self.pattern)
242
243
class FnmatchPattern(PatternBase):
    """Shell glob pattern, translated by fnmatch.  A pattern with a trailing
    slash matches the contents of a directory, but not the directory itself.
    """
    PREFIX = "fm"

    def _prepare(self, pattern):
        sep = os.path.sep

        if pattern.endswith(sep):
            # contents only: at least one more path element has to follow
            glob = os.path.normpath(pattern).rstrip(sep) + sep + '*' + sep
        else:
            # the path itself plus anything below it
            glob = os.path.normpath(pattern) + sep + '*'

        self.pattern = glob

        # fnmatch and re.match both cache compiled regular expressions.
        # Nevertheless, this is about 10 times faster.
        self.regex = re.compile(fnmatch.translate(glob))

    def _match(self, path):
        # paths are matched with a separator appended, see _prepare()
        found = self.regex.match(path + os.path.sep)
        return found is not None
264
265
class ShellPattern(PatternBase):
    """Shell glob pattern, translated by shellpattern.translate().  A pattern
    with a trailing slash is meant to match the contents of a directory,
    but not the directory itself.
    """
    PREFIX = "sh"

    def _prepare(self, pattern):
        sep = os.path.sep
        # NOTE(review): "**" presumably spans any number of directory levels
        # in shellpattern.translate() -- confirm against that module
        below = sep + "**" + sep + "*"

        if pattern.endswith(sep):
            glob = os.path.normpath(pattern).rstrip(sep) + below + sep
        else:
            glob = os.path.normpath(pattern) + below

        self.pattern = glob
        self.regex = re.compile(shellpattern.translate(glob))

    def _match(self, path):
        # paths are matched with a separator appended, see _prepare()
        found = self.regex.match(path + os.path.sep)
        return found is not None
285
286
class RegexPattern(PatternBase):
    """Regular expression pattern; it may match anywhere in the path, as
    re.search() is used.
    """
    PREFIX = "re"

    def _prepare(self, pattern):
        self.pattern = pattern
        self.regex = re.compile(pattern)

    def _match(self, path):
        sep = os.path.sep
        # expressions are written with '/', so convert native separators first
        unified = path if sep == '/' else path.replace(sep, '/')
        return self.regex.search(unified) is not None
302
303
# every pattern implementation that can be selected through a style prefix
_PATTERN_CLASSES = {
    FnmatchPattern,
    PathFullPattern,
    PathPrefixPattern,
    RegexPattern,
    ShellPattern,
}

# two-letter style prefix (e.g. "sh") -> pattern class
_PATTERN_CLASS_BY_PREFIX = {cls.PREFIX: cls for cls in _PATTERN_CLASSES}

# result of parsing one command: *val* is the payload, *cmd* the IECommand
CmdTuple = namedtuple('CmdTuple', 'val cmd')
315
316
class IECommand(Enum):
    """A command that an InclExcl file line can represent.

    The command is selected by the first character of the line, see the
    prefix map in parse_inclexcl_command().
    """
    # 'R' / 'r': the value is a root path string
    RootPath = 1
    # 'P' / 'p': the value is the pattern class to use as default style
    PatternStyle = 2
    # '+': paths matching the pattern are included
    Include = 3
    # '-': paths matching the pattern are excluded
    Exclude = 4
    # '!': excluded as well; the only command for which
    # command_recurses_dir() returns False
    ExcludeNoRecurse = 5
325
326
def command_recurses_dir(cmd):
    """Return False for IECommand.ExcludeNoRecurse and True for any other *cmd*."""
    # TODO?: raise error or return None if *cmd* is RootPath or PatternStyle
    non_recursing = (IECommand.ExcludeNoRecurse,)
    return cmd not in non_recursing
330
331
def get_pattern_class(prefix):
    """Return the pattern class registered for the style *prefix*.

    Raises ValueError when no class uses that prefix.
    """
    if prefix in _PATTERN_CLASS_BY_PREFIX:
        return _PATTERN_CLASS_BY_PREFIX[prefix]
    raise ValueError("Unknown pattern style: {}".format(prefix)) from None
337
338
def parse_pattern(pattern, fallback=FnmatchPattern, recurse_dir=True):
    """Build a pattern object from its textual form.

    Text of the form "xx:rest", where "xx" are two alphanumeric characters,
    selects the class registered for style "xx" and uses "rest" as pattern.
    Anything else is passed unchanged to the *fallback* class.  *recurse_dir*
    is handed to the pattern's constructor.

    Raises ValueError (from get_pattern_class) for an unknown style.
    """
    has_style_prefix = len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum()
    if not has_style_prefix:
        return fallback(pattern, recurse_dir)

    style_cls = get_pattern_class(pattern[:2])
    # skip the two style characters and the colon
    return style_cls(pattern[3:], recurse_dir)
349
350
def parse_exclude_pattern(pattern_str, fallback=FnmatchPattern):
    """Parse *pattern_str* into a non-recursing exclude command.

    The pattern object is created with recurse_dir=False and returned in a
    CmdTuple together with IECommand.ExcludeNoRecurse.
    """
    return CmdTuple(
        val=parse_pattern(pattern_str, fallback, recurse_dir=False),
        cmd=IECommand.ExcludeNoRecurse,
    )
356
357
def parse_inclexcl_command(cmd_line_str, fallback=ShellPattern):
    """Read a --patterns-from command from string and return a CmdTuple object.

    The first character of *cmd_line_str* selects the command, the rest of
    the string (leading whitespace removed) is its value:

    - '-', '!' and '+' take a pattern; the CmdTuple's val is a pattern object
      created by parse_pattern() with *fallback* as default style.
    - 'R' / 'r' take a root path; val is the value string as given.
    - 'P' / 'p' take a style prefix such as 're' or 'sh'; val is the pattern
      class registered for it.

    Raises argparse.ArgumentTypeError for an empty string, an unknown command
    character, a missing value or an unknown style after 'P' / 'p'.  An
    unknown style prefix inside a pattern value is not translated: the
    ValueError raised via parse_pattern() propagates unchanged.
    """

    cmd_prefix_map = {
        '-': IECommand.Exclude,
        '!': IECommand.ExcludeNoRecurse,
        '+': IECommand.Include,
        'R': IECommand.RootPath,
        'r': IECommand.RootPath,
        'P': IECommand.PatternStyle,
        'p': IECommand.PatternStyle,
    }
    if not cmd_line_str:
        raise argparse.ArgumentTypeError("A pattern/command must not be empty.")

    cmd = cmd_prefix_map.get(cmd_line_str[0])
    if cmd is None:
        raise argparse.ArgumentTypeError("A pattern/command must start with anyone of: %s" %
                                         ', '.join(cmd_prefix_map))

    # remaining text on command-line following the command character
    remainder_str = cmd_line_str[1:].lstrip()
    if not remainder_str:
        raise argparse.ArgumentTypeError("A pattern/command must have a value part.")

    if cmd is IECommand.RootPath:
        # TODO: validate string?
        val = remainder_str
    elif cmd is IECommand.PatternStyle:
        # then remainder_str is something like 're' or 'sh'
        try:
            val = get_pattern_class(remainder_str)
        except ValueError:
            # The lookup failure is an implementation detail; suppress it so
            # that only the user-facing error shows up, the same way
            # get_pattern_class() hides its KeyError.
            raise argparse.ArgumentTypeError("Invalid pattern style: {}".format(remainder_str)) from None
    else:
        # determine recurse_dir based on command type
        recurse_dir = command_recurses_dir(cmd)
        val = parse_pattern(remainder_str, fallback, recurse_dir)

    return CmdTuple(val, cmd)
398