import argparse
import fnmatch
import os.path
import re
import sys
import unicodedata
from collections import namedtuple
from enum import Enum

from . import shellpattern
from .helpers import clean_lines


def parse_patternfile_line(line, roots, ie_commands, fallback):
    """Parse a pattern-file line and act depending on which command it represents.

    *line* is handed to parse_inclexcl_command() using *fallback* as the
    pattern class for patterns without an explicit style prefix.

    - A root path command appends its value to the *roots* list.
    - A pattern style command replaces the fallback pattern class.
    - Any other command (include/exclude) is appended to *ie_commands*.

    Returns the (possibly updated) fallback pattern class, so the caller can
    pass it along when parsing the next line.
    """
    ie_command = parse_inclexcl_command(line, fallback=fallback)
    if ie_command.cmd is IECommand.RootPath:
        roots.append(ie_command.val)
    elif ie_command.cmd is IECommand.PatternStyle:
        fallback = ie_command.val
    else:
        # it is some kind of include/exclude command
        ie_commands.append(ie_command)
    return fallback


def load_pattern_file(fileobj, roots, ie_commands, fallback=None):
    """Parse every line yielded by clean_lines(fileobj) as a pattern-file line.

    Root paths are appended to *roots*, include/exclude commands to
    *ie_commands*. *fallback* is the initial pattern class for patterns
    without a style prefix; it defaults to ShellPattern and is updated
    whenever a pattern style command is encountered.
    """
    if fallback is None:
        fallback = ShellPattern  # ShellPattern is defined later in this module
    for line in clean_lines(fileobj):
        fallback = parse_patternfile_line(line, roots, ie_commands, fallback)


def load_exclude_file(fileobj, patterns):
    """Parse every line yielded by clean_lines(fileobj) as an exclude pattern.

    The resulting CmdTuple objects (see parse_exclude_pattern) are appended
    to the *patterns* list.
    """
    for patternstr in clean_lines(fileobj):
        patterns.append(parse_exclude_pattern(patternstr))


class ArgparsePatternAction(argparse.Action):
    """argparse action that parses a single pattern/command given on the command line."""

    def __init__(self, nargs=1, **kw):
        super().__init__(nargs=nargs, **kw)

    def __call__(self, parser, args, values, option_string=None):
        # The returned fallback is intentionally dropped: each command line
        # option is parsed on its own, always starting from ShellPattern.
        parse_patternfile_line(values[0], args.paths, args.patterns, ShellPattern)


class ArgparsePatternFileAction(argparse.Action):
    """argparse action that reads patterns/commands from the file named by the option value."""

    def __init__(self, nargs=1, **kw):
        super().__init__(nargs=nargs, **kw)

    def __call__(self, parser, args, values, option_string=None):
        """Load and parse patterns from a file.
        Lines empty or starting with '#' after stripping whitespace on both line ends are ignored.
        """
        filename = values[0]
        with open(filename) as f:
            self.parse(f, args)

    def parse(self, fobj, args):
        """Parse *fobj* as a pattern file, filling args.paths and args.patterns."""
        load_pattern_file(fobj, args.paths, args.patterns)


class ArgparseExcludeFileAction(ArgparsePatternFileAction):
    """Like ArgparsePatternFileAction, but every line of the file is an exclude pattern."""

    def parse(self, fobj, args):
        """Parse *fobj* as an exclude file, filling args.patterns."""
        load_exclude_file(fobj, args.patterns)


class PatternMatcher:
    """Represents a collection of pattern objects to match paths against.

    *fallback* is a boolean value that *match()* returns if no matching patterns are found.

    """
    # Sentinel for "path not present in the full-path lookup table". Created
    # once instead of on every match() call.
    _NON_EXISTENT = object()

    def __init__(self, fallback=None):
        # list of (pattern, cmd) tuples, checked in insertion order by match()
        self._items = []

        # Value to return from match function when none of the patterns match.
        self.fallback = fallback

        # optimizations
        self._path_full_patterns = {}  # full path -> return value

        # indicates whether the last match() call ended on a pattern for which
        # we should recurse into any matching folder. Will be set to True or
        # False when calling match().
        self.recurse_dir = None

        # whether to recurse into directories when no match is found
        # TODO: allow modification as a config option?
        self.recurse_dir_default = True

        # patterns added via add_includepaths(), see get_unmatched_include_patterns()
        self.include_patterns = []

        # TODO: move this info to parse_inclexcl_command and store in PatternBase subclass?
        self.is_include_cmd = {
            IECommand.Exclude: False,
            IECommand.ExcludeNoRecurse: False,
            IECommand.Include: True
        }

    def empty(self):
        """Return True if no pattern of any kind has been added to this matcher."""
        return not self._items and not self._path_full_patterns

    def _add(self, pattern, cmd):
        """*cmd* is an IECommand value.

        PathFullPattern instances go into a dict keyed by their path, so that
        match() can look them up directly; for the same path, a later
        registration overwrites an earlier one. All other patterns are
        appended to the ordered item list.
        """
        if isinstance(pattern, PathFullPattern):
            key = pattern.pattern  # full, normalized path
            self._path_full_patterns[key] = cmd
        else:
            self._items.append((pattern, cmd))

    def add(self, patterns, cmd):
        """Add list of patterns to internal list. *cmd* indicates whether the
        pattern is an include/exclude pattern, and whether recursion should be
        done on excluded folders.
        """
        for pattern in patterns:
            self._add(pattern, cmd)

    def add_includepaths(self, include_paths):
        """Used to add inclusion-paths from args.paths (from commandline).

        Also sets self.fallback: it becomes False if at least one include
        path was given, True otherwise.
        """
        include_patterns = [parse_pattern(p, PathPrefixPattern) for p in include_paths]
        self.add(include_patterns, IECommand.Include)
        self.fallback = not include_patterns
        self.include_patterns = include_patterns

    def get_unmatched_include_patterns(self):
        "Note that this only returns patterns added via *add_includepaths*."
        return [p for p in self.include_patterns if p.match_count == 0]

    def add_inclexcl(self, patterns):
        """Add list of patterns (of type CmdTuple) to internal list.
        """
        for pattern, cmd in patterns:
            self._add(pattern, cmd)

    def match(self, path):
        """Return True or False depending on whether *path* is matched.

        If no match is found among the patterns in this matcher, then the value
        in self.fallback is returned (defaults to None).

        As a side effect, self.recurse_dir is updated on every call.
        """
        path = normalize_path(path)
        # do a fast lookup for full path matches (note: we do not count such matches):
        value = self._path_full_patterns.get(path, self._NON_EXISTENT)

        if value is not self._NON_EXISTENT:
            # we have a full path match!
            self.recurse_dir = command_recurses_dir(value)
            return self.is_include_cmd[value]

        # this is the slow way, if we have many patterns in self._items:
        for (pattern, cmd) in self._items:
            # path was already normalized above, do not normalize it again
            if pattern.match(path, normalize=False):
                self.recurse_dir = pattern.recurse_dir
                return self.is_include_cmd[cmd]

        # by default we will recurse if there is no match
        self.recurse_dir = self.recurse_dir_default
        return self.fallback


def normalize_path(path):
    """normalize paths for MacOS (but do nothing on other platforms)"""
    # HFS+ converts paths to a canonical form, so users shouldn't be required to enter an exact match.
    # Windows and Unix filesystems allow different forms, so users always have to enter an exact match.
    return unicodedata.normalize('NFD', path) if sys.platform == 'darwin' else path


class PatternBase:
    """Shared logic for inclusion/exclusion patterns.

    Subclasses set PREFIX (the two-character style prefix used by
    parse_pattern) and implement _prepare() and _match().
    """
    PREFIX = NotImplemented

    def __init__(self, pattern, recurse_dir=False):
        self.pattern_orig = pattern  # as given by the user, returned by __str__
        self.match_count = 0  # number of successful match() calls
        pattern = normalize_path(pattern)
        self._prepare(pattern)
        self.recurse_dir = recurse_dir

    def match(self, path, normalize=True):
        """Return a boolean indicating whether *path* is matched by this pattern.

        If normalize is True (default), the path will get normalized using normalize_path(),
        otherwise it is assumed that it already is normalized using that function.
        """
        if normalize:
            path = normalize_path(path)
        matches = self._match(path)
        if matches:
            self.match_count += 1
        return matches

    def __repr__(self):
        return '%s(%s)' % (type(self), self.pattern)

    def __str__(self):
        return self.pattern_orig

    def _prepare(self, pattern):
        "Should set the value of self.pattern"
        raise NotImplementedError

    def _match(self, path):
        "Should return whether the (already normalized) *path* matches self.pattern"
        raise NotImplementedError


class PathFullPattern(PatternBase):
    """Full match of a path."""
    PREFIX = "pf"

    def _prepare(self, pattern):
        self.pattern = os.path.normpath(pattern)

    def _match(self, path):
        return path == self.pattern


# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
# separator to the end of the path before matching.


class PathPrefixPattern(PatternBase):
    """Literal files or directories listed on the command line
    for some operations (e.g. extract, but not create).
    If a directory is specified, all paths that start with that
    path match as well.  A trailing slash makes no difference.
    """
    PREFIX = "pp"

    def _prepare(self, pattern):
        # always store the pattern with exactly one trailing separator
        self.pattern = os.path.normpath(pattern).rstrip(os.path.sep) + os.path.sep

    def _match(self, path):
        return (path + os.path.sep).startswith(self.pattern)


class FnmatchPattern(PatternBase):
    """Shell glob patterns to exclude.  A trailing slash means to
    exclude the contents of a directory, but not the directory itself.
    """
    PREFIX = "fm"

    def _prepare(self, pattern):
        if pattern.endswith(os.path.sep):
            pattern = os.path.normpath(pattern).rstrip(os.path.sep) + os.path.sep + '*' + os.path.sep
        else:
            pattern = os.path.normpath(pattern) + os.path.sep + '*'

        self.pattern = pattern

        # fnmatch and re.match both cache compiled regular expressions.
        # Nevertheless, this is about 10 times faster.
        self.regex = re.compile(fnmatch.translate(self.pattern))

    def _match(self, path):
        return (self.regex.match(path + os.path.sep) is not None)


class ShellPattern(PatternBase):
    """Shell glob patterns to exclude.  A trailing slash means to
    exclude the contents of a directory, but not the directory itself.
    """
    PREFIX = "sh"

    def _prepare(self, pattern):
        sep = os.path.sep

        if pattern.endswith(sep):
            pattern = os.path.normpath(pattern).rstrip(sep) + sep + "**" + sep + "*" + sep
        else:
            pattern = os.path.normpath(pattern) + sep + "**" + sep + "*"

        self.pattern = pattern
        # the glob-to-regex translation is done by the project's shellpattern module
        self.regex = re.compile(shellpattern.translate(self.pattern))

    def _match(self, path):
        return (self.regex.match(path + os.path.sep) is not None)


class RegexPattern(PatternBase):
    """Regular expression to exclude.
    """
    PREFIX = "re"

    def _prepare(self, pattern):
        self.pattern = pattern
        self.regex = re.compile(pattern)

    def _match(self, path):
        # Normalize path separators
        if os.path.sep != '/':
            path = path.replace(os.path.sep, '/')

        # search(), not match(): the expression may match anywhere in the path
        return (self.regex.search(path) is not None)


_PATTERN_CLASSES = {
    FnmatchPattern,
    PathFullPattern,
    PathPrefixPattern,
    RegexPattern,
    ShellPattern,
}

# style prefix (e.g. "sh") -> pattern class
_PATTERN_CLASS_BY_PREFIX = {i.PREFIX: i for i in _PATTERN_CLASSES}

# val: a pattern object, a root path string or a pattern class (depending on cmd); cmd: an IECommand value
CmdTuple = namedtuple('CmdTuple', 'val cmd')


class IECommand(Enum):
    """A command that an InclExcl file line can represent.
    """
    RootPath = 1
    PatternStyle = 2
    Include = 3
    Exclude = 4
    ExcludeNoRecurse = 5


def command_recurses_dir(cmd):
    """Return False for IECommand.ExcludeNoRecurse, True for every other *cmd*."""
    # TODO?: raise error or return None if *cmd* is RootPath or PatternStyle
    return cmd is not IECommand.ExcludeNoRecurse


def get_pattern_class(prefix):
    """Return the pattern class registered for the style *prefix*.

    Raises ValueError if there is no pattern class with that prefix.
    """
    try:
        return _PATTERN_CLASS_BY_PREFIX[prefix]
    except KeyError:
        raise ValueError("Unknown pattern style: {}".format(prefix)) from None


def parse_pattern(pattern, fallback=FnmatchPattern, recurse_dir=True):
    """Read pattern from string and return an instance of the appropriate implementation class.

    A pattern of the form "xx:rest" (two alphanumeric characters followed by a
    colon) selects the pattern class by the style prefix "xx" and uses "rest"
    as the pattern; an unknown prefix raises ValueError (see get_pattern_class).
    Otherwise the whole string is the pattern and *fallback* is the class.
    """
    if len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum():
        (style, pattern) = (pattern[:2], pattern[3:])
        cls = get_pattern_class(style)
    else:
        cls = fallback
    return cls(pattern, recurse_dir)


def parse_exclude_pattern(pattern_str, fallback=FnmatchPattern):
    """Read pattern from string and return an instance of the appropriate implementation class.

    The pattern is created with recurse_dir=False and returned wrapped in a
    CmdTuple together with IECommand.ExcludeNoRecurse.
    """
    epattern_obj = parse_pattern(pattern_str, fallback, recurse_dir=False)
    return CmdTuple(epattern_obj, IECommand.ExcludeNoRecurse)


def parse_inclexcl_command(cmd_line_str, fallback=ShellPattern):
    """Read a --patterns-from command from string and return a CmdTuple object.

    The first character of *cmd_line_str* selects the command, the rest (with
    leading whitespace removed) is its value:

    - root path command: the value is kept as a string.
    - pattern style command: the value is a style prefix, resolved to a pattern class.
    - include/exclude commands: the value is parsed by parse_pattern() with
      *fallback* as the default pattern class.

    Raises argparse.ArgumentTypeError if the string is empty, starts with an
    unknown command character, has no value part, or names an invalid pattern
    style.
    """

    cmd_prefix_map = {
        '-': IECommand.Exclude,
        '!': IECommand.ExcludeNoRecurse,
        '+': IECommand.Include,
        'R': IECommand.RootPath,
        'r': IECommand.RootPath,
        'P': IECommand.PatternStyle,
        'p': IECommand.PatternStyle,
    }
    if not cmd_line_str:
        raise argparse.ArgumentTypeError("A pattern/command must not be empty.")

    cmd = cmd_prefix_map.get(cmd_line_str[0])
    if cmd is None:
        raise argparse.ArgumentTypeError("A pattern/command must start with anyone of: %s" %
                                         ', '.join(cmd_prefix_map))

    # remaining text on command-line following the command character
    remainder_str = cmd_line_str[1:].lstrip()
    if not remainder_str:
        raise argparse.ArgumentTypeError("A pattern/command must have a value part.")

    if cmd is IECommand.RootPath:
        # TODO: validate string?
        val = remainder_str
    elif cmd is IECommand.PatternStyle:
        # then remainder_str is something like 're' or 'sh'
        try:
            val = get_pattern_class(remainder_str)
        except ValueError:
            # the ValueError is an internal detail, do not chain it to the user-facing error
            raise argparse.ArgumentTypeError("Invalid pattern style: {}".format(remainder_str)) from None
    else:
        # determine recurse_dir based on command type
        recurse_dir = command_recurses_dir(cmd)
        val = parse_pattern(remainder_str, fallback, recurse_dir)

    return CmdTuple(val, cmd)