#! /usr/bin/env python
# coding=utf-8

from __future__ import print_function
from __future__ import unicode_literals

__version__ = "1.7"

import sys
from os import path, extsep
from subprocess import Popen, PIPE, CalledProcessError


class GitArchiver(object):
    """
    GitArchiver

    Scan a git repository and export all tracked files, and submodules.
    Checks for .gitattributes files in each directory and uses 'export-ignore'
    pattern entries for ignore files in the archive.

    Automatically detects output format extension: zip, tar, bz2, or gz.
    """

    def __init__(self, prefix='', verbose=False, exclude=True, force_sub=False, extra=None, main_repo_abspath=None):
        """
        @type prefix: string
        @param prefix: Prefix used to prepend all paths in the resulting archive.

        @type verbose: bool
        @param verbose: Determines verbosity of the output (stdout).

        @type exclude: bool
        @param exclude: Determines whether archiver should follow rules specified in .gitattributes files.
            Defaults to True.

        @type force_sub: bool
        @param force_sub: Determines whether submodules are initialized and updated before archiving.
            Defaults to False

        @type extra: list
        @param extra: List of extra paths to include in the resulting archive.

        @type main_repo_abspath: string
        @param main_repo_abspath: Absolute path to the main repository (or one of subdirectories).
            If None, current cwd is used.
            If given path is path to a subdirectory (but not a submodule directory!)
            it will be replaced with abspath to toplevel directory of the repository.

        @raise ValueError: If main_repo_abspath is not absolute or is not inside a git repository.
        """
        if extra is None:
            extra = []

        if main_repo_abspath is None:
            main_repo_abspath = path.abspath('')
        elif not path.isabs(main_repo_abspath):
            raise ValueError("You MUST pass absolute path to the main git repository.")

        # Any failure of the probe command (non-zero exit, missing directory, ...)
        # is reported as "not a git repository" for the offending path.
        try:
            self.run_shell("[ -d .git ] || git rev-parse --git-dir > /dev/null 2>&1", main_repo_abspath)
        except Exception:
            raise ValueError("{path} not a git repository (or any of the parent directories).".format(
                path=main_repo_abspath))

        # Detect toplevel directory of the repo.
        main_repo_abspath = path.abspath(
            self.read_git_shell('git rev-parse --show-toplevel', main_repo_abspath).rstrip())

        self.prefix = prefix
        self.verbose = verbose
        self.exclude = exclude
        self.extra = extra
        self.force_sub = force_sub
        self.main_repo_abspath = main_repo_abspath

    def create(self, output_path, dry_run=False, output_format=None):
        """
        Creates the archive, written to the given output_path.

        Type of the archive is determined either by extension of output_path or by the output_format argument.
        Supported formats are: gz, zip, bz2, tar, tgz

        @type output_path: string
        @param output_path: Output file path.

        @type dry_run: bool
        @param dry_run: Determines whether create should do nothing but print what it would archive.

        @type output_format: string
        @param output_format: Determines format of the output archive.
            If None, format is determined from extension of output_path.

        @raise RuntimeError: If the output format is not one of the supported formats.
        """
        if output_format is None:
            file_name, file_ext = path.splitext(output_path)
            output_format = file_ext[len(extsep):].lower()

        # Both stay None during a dry run: nothing is opened and nothing is written.
        archive = None
        add = None

        if output_format == 'zip':
            from zipfile import ZipFile, ZIP_DEFLATED

            if not dry_run:
                archive = ZipFile(path.abspath(output_path), 'w')
                add = lambda file_path, file_name: archive.write(file_path,
                                                                 path.join(self.prefix, file_name),
                                                                 ZIP_DEFLATED)
        elif output_format in ['tar', 'bz2', 'gz', 'tgz']:
            import tarfile

            if output_format == 'tar':
                t_mode = 'w'
            elif output_format == 'tgz':
                t_mode = 'w:gz'
            else:
                t_mode = 'w:{f}'.format(f=output_format)

            if not dry_run:
                archive = tarfile.open(path.abspath(output_path), t_mode)
                add = lambda file_path, file_name: archive.add(file_path, path.join(self.prefix, file_name))
        else:
            raise RuntimeError("Unknown format: {f}".format(f=output_format))

        # try/finally guarantees the archive handle is released even if adding a file fails.
        try:
            # Extra files are archived under the same path they were given with.
            for file_path in self.extra:
                if not dry_run:
                    if self.verbose:
                        print("Compressing {f} => {a}...".format(f=file_path,
                                                                 a=path.join(self.prefix, file_path)))
                    add(file_path, file_path)
                else:
                    print("{f} => {a}".format(f=file_path,
                                              a=path.join(self.prefix, file_path)))

            # Tracked files are read from the repository and stored relative to its toplevel.
            for file_path in self.list_files():
                file_abspath = path.join(self.main_repo_abspath, file_path)

                if not dry_run:
                    if self.verbose:
                        print("Compressing {f} => {a}...".format(f=file_abspath,
                                                                 a=path.join(self.prefix, file_path)))
                    add(file_abspath, file_path)
                else:
                    print("{f} => {a}".format(f=file_abspath,
                                              a=path.join(self.prefix, file_path)))
        finally:
            if archive is not None:
                archive.close()

    def get_path_components(self, repo_abspath, abspath):
        """
        Splits given abspath into components until repo_abspath is reached.

        E.g. if repo_abspath is '/Documents/Hobby/ParaView/' and abspath is
        '/Documents/Hobby/ParaView/Catalyst/Editions/Base/', function will return:
        ['.', 'Catalyst', 'Editions', 'Base']

        First element is always '.' (concrete symbol depends on OS).

        @type repo_abspath: string
        @param repo_abspath: Absolute path to the git repository.

        @type abspath: string
        @param abspath: Absolute path to within repo_abspath.

        @rtype: list
        @return: List of path components.

        @raise ValueError: If abspath is not located within repo_abspath.
        """
        components = []
        current = abspath

        while not path.samefile(current, repo_abspath):
            parent, tail = path.split(current)

            # path.split() returns its input unchanged once the filesystem root is reached;
            # without this check a path outside of the repository would loop forever.
            if parent == current:
                raise ValueError("{path} is not within {repo}.".format(path=abspath, repo=repo_abspath))

            current = parent

            # tail is empty when the path ended with a separator.
            if tail:
                components.insert(0, tail)

        components.insert(0, path.relpath(repo_abspath, repo_abspath))
        return components

    def get_exclude_patterns(self, repo_abspath, repo_file_paths):
        """
        Returns exclude patterns for a given repo. It looks for .gitattributes files in repo_file_paths.

        Resulting dictionary will contain exclude patterns per path (relative to the repo_abspath).
        E.g. {('.', 'Catalyst', 'Editions', 'Base'): ['Foo*', '*Bar']}

        The empty tuple key holds patterns read from the file named by the
        core.attributesfile git setting (empty list if there is none).

        @type repo_abspath: string
        @param repo_abspath: Absolute path to the git repository.

        @type repo_file_paths: list
        @param repo_file_paths: List of paths relative to the repo_abspath that are under git control.

        @rtype: dict
        @return: Dictionary representing exclude patterns.
            Keys are tuples of strings. Values are lists of strings.
            Returns None if self.exclude is not set.
        """
        if not self.exclude:
            return None

        def read_attributes(attributes_abspath):
            # Collect the pattern (first token) of every line that carries the export-ignore attribute.
            patterns = []

            if path.isfile(attributes_abspath):
                with open(attributes_abspath, 'r') as attributes:
                    for line in attributes:
                        tokens = line.strip().split()

                        if "export-ignore" in tokens[1:]:
                            patterns.append(tokens[0])

            return patterns

        exclude_patterns = {(): []}

        # There may be no gitattributes.
        try:
            global_attributes_abspath = self.read_shell("git config --get core.attributesfile",
                                                        repo_abspath).rstrip()
            # The setting is a path and may start with '~', which the OS will not expand on its own.
            exclude_patterns[()] = read_attributes(path.expanduser(global_attributes_abspath))
        except Exception:
            # Deliberately best-effort: it is valid to have no global attributes file
            # (git exits with non-zero status when the setting is absent).
            pass

        for repo_file_path in repo_file_paths:
            if not repo_file_path.endswith(".gitattributes"):
                continue

            attributes_abspath = path.join(repo_abspath, repo_file_path)
            # Each .gitattributes affects only files within its directory.
            key = tuple(self.get_path_components(repo_abspath, path.dirname(attributes_abspath)))
            exclude_patterns[key] = read_attributes(attributes_abspath)

        # Patterns from .git/info/attributes are merged into the repository root entry.
        local_attributes_abspath = path.join(repo_abspath, ".git", "info", "attributes")
        key = tuple(self.get_path_components(repo_abspath, repo_abspath))
        exclude_patterns.setdefault(key, []).extend(read_attributes(local_attributes_abspath))

        return exclude_patterns

    def is_file_excluded(self, repo_abspath, repo_file_path, exclude_patterns):
        """
        Checks whether file at a given path is excluded.

        @type repo_abspath: string
        @param repo_abspath: Absolute path to the git repository.

        @type repo_file_path: string
        @param repo_file_path: Path to a file within repo_abspath.

        @type exclude_patterns: dict
        @param exclude_patterns: Exclude patterns with format specified for get_exclude_patterns.

        @rtype: bool
        @return: True if file should be excluded. Otherwise False.
        """
        if not exclude_patterns:
            return False

        from fnmatch import fnmatch

        file_name = path.basename(repo_file_path)
        components = self.get_path_components(repo_abspath, path.join(repo_abspath, path.dirname(repo_file_path)))

        is_excluded = False
        # We should check all patterns specified in intermediate directories to the given file.
        # At the end we should also check for the global patterns (key '()' or empty tuple).
        while not is_excluded:
            # Patterns are tested against both the bare file name and the repo-relative path.
            for p in exclude_patterns.get(tuple(components), []):
                if fnmatch(file_name, p) or fnmatch(repo_file_path, p):
                    if self.verbose:
                        print("Exclude pattern matched {pattern}: {path}".format(pattern=p, path=repo_file_path))
                    is_excluded = True

            if not components:
                break

            # Move one directory up towards the repository root.
            components.pop()

        return is_excluded

    def list_files(self, repo_path=''):
        """
        An iterator method that yields a file path relative to main_repo_abspath
        for each file that should be included in the archive.
        Skips those that match the exclusion patterns found in
        any discovered .gitattributes files along the way.

        Recurs into submodules as well.

        @type repo_path: string
        @param repo_path: Path to the git submodule repository within the main git repository.

        @rtype: iterator
        @return: Iterator to traverse files under git control relative to main_repo_abspath.
        """
        repo_abspath = path.join(self.main_repo_abspath, repo_path)
        listing = self.read_git_shell("git ls-files --cached --full-name --no-empty-directory", repo_abspath)
        # Git puts path in quotes if file path has unicode characters. Quotes are stripped up front
        # so that the .gitattributes lookup below sees the same clean paths as the loop.
        repo_file_paths = [line.strip('"') for line in listing.splitlines()]
        exclude_patterns = self.get_exclude_patterns(repo_abspath, repo_file_paths)

        for repo_file_path in repo_file_paths:  # file path relative to current repo
            file_name = path.basename(repo_file_path)
            # The file system must be queried with an absolute path: the relative one
            # would be resolved against the process cwd rather than the repository.
            repo_file_abspath = path.join(repo_abspath, repo_file_path)

            # Only list symlinks and files that don't start with git.
            if file_name.startswith(".git") or (not path.islink(repo_file_abspath) and path.isdir(repo_file_abspath)):
                continue

            main_repo_file_path = path.join(repo_path, repo_file_path)  # file path relative to the main repo

            if self.is_file_excluded(repo_abspath, repo_file_path, exclude_patterns):
                continue

            # Yield the path relative to the main repo to preserve structure of the repo.
            yield main_repo_file_path

        if self.force_sub:
            self.run_shell("git submodule init", repo_abspath)
            self.run_shell("git submodule update", repo_abspath)

        # List files of every submodule.
        for submodule_path in self.read_shell("git submodule --quiet foreach 'pwd'", repo_abspath).splitlines():
            # In order to get output path we need to exclude repository path from submodule_path.
            submodule_path = path.relpath(submodule_path, self.main_repo_abspath)
            for file_path in self.list_files(submodule_path):
                yield file_path

    @staticmethod
    def run_shell(cmd, cwd=None):
        """
        Runs shell command.

        @type cmd: string
        @param cmd: Command to be executed.

        @type cwd: string
        @param cwd: Working directory.

        @rtype: int
        @return: Return code of the command.

        @raise CalledProcessError: Raises exception if return code of the command is non-zero.
        """
        p = Popen(cmd, shell=True, cwd=cwd)
        p.wait()

        if p.returncode:
            raise CalledProcessError(returncode=p.returncode, cmd=cmd)

        return p.returncode

    @staticmethod
    def read_shell(cmd, cwd=None, encoding='utf-8'):
        """
        Runs shell command and reads output.

        @type cmd: string
        @param cmd: Command to be executed.

        @type cwd: string
        @param cwd: Working directory.

        @type encoding: string
        @param encoding: Encoding used to decode bytes returned by Popen into string.

        @rtype: string
        @return: Output of the command.

        @raise CalledProcessError: Raises exception if return code of the command is non-zero.
        """
        p = Popen(cmd, shell=True, stdout=PIPE, cwd=cwd)
        output, _ = p.communicate()
        output = output.decode(encoding)

        if p.returncode:
            raise CalledProcessError(returncode=p.returncode, cmd=cmd, output=output)

        return output

    @staticmethod
    def read_git_shell(cmd, cwd=None):
        """
        Runs git shell command, reads output and decodes it into unicode string

        @type cmd: string
        @param cmd: Command to be executed.

        @type cwd: string
        @param cwd: Working directory.

        @rtype: string
        @return: Output of the command.

        @raise CalledProcessError: Raises exception if return code of the command is non-zero.
        """
        p = Popen(cmd, shell=True, stdout=PIPE, cwd=cwd)
        output, _ = p.communicate()
        # Three-step decode: backslash escapes in the output are first turned into characters,
        # those characters are mapped back to raw bytes, and the bytes are finally read as UTF-8.
        # NOTE(review): presumably this undoes git's escaping of non-ASCII bytes in paths - confirm.
        output = output.decode('unicode_escape').encode('raw_unicode_escape').decode('utf-8')

        if p.returncode:
            raise CalledProcessError(returncode=p.returncode, cmd=cmd, output=output)

        return output


if __name__ == '__main__':
    from optparse import OptionParser

    parser = OptionParser(usage="usage: %prog [-v] [--prefix PREFIX] [--no-exclude] [--force-submodules] [--dry-run] OUTPUT_FILE",
                          version="%prog {version}".format(version=__version__))

    parser.add_option('--prefix',
                      type='string',
                      dest='prefix',
                      default='',
                      help="Prepend PREFIX to each filename in the archive. OUTPUT_FILE name is used by default to avoid tarbomb.")

    parser.add_option('-v', '--verbose',
                      action='store_true',
                      dest='verbose',
                      help='Enable verbose mode.')

    parser.add_option('--no-exclude',
                      action='store_false',
                      dest='exclude',
                      default=True,
                      help="Don't read .gitattributes files for patterns containing export-ignore attrib.")

    parser.add_option('--force-submodules',
                      action='store_true',
                      dest='force_sub',
                      help="Force a git submodule init && git submodule update at each level before iterating submodules.")

    parser.add_option('--extra',
                      action='append',
                      dest='extra',
                      default=[],
                      help="Any additional files to include in the archive.")
    parser.add_option('--dry-run',
                      action='store_true',
                      dest='dry_run',
                      help="Don't actually archive anything, just show what would be done.")

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error("You must specify exactly one output file")

    output_file_path = args[0]

    if path.isdir(output_file_path):
        parser.error("You cannot use directory as output")

    # avoid tarbomb
    if options.prefix:
        # Joining with '' appends a trailing separator so the prefix is treated as a directory.
        options.prefix = path.join(options.prefix, '')
    else:
        import re

        # Default prefix is the output file name with its archive extension removed.
        output_name = path.basename(output_file_path)
        output_name = re.sub(r'(\.zip|\.tar|\.tgz|\.gz|\.bz2|\.tar\.gz|\.tar\.bz2)$', '', output_name) or "Archive"
        options.prefix = path.join(output_name, '')

    try:
        archiver = GitArchiver(options.prefix,
                               options.verbose,
                               options.exclude,
                               options.force_sub,
                               options.extra)
        archiver.create(output_file_path, options.dry_run)
    except Exception as e:
        # Top-level boundary: report any failure on stderr and exit with status 2.
        parser.exit(2, "{exception}\n".format(exception=e))

    sys.exit(0)