1#! /usr/bin/env python
2# coding=utf-8
3
4from __future__ import print_function
5from __future__ import unicode_literals
6
7__version__ = "1.7"
8
9import sys
10from os import path, extsep
11from subprocess import Popen, PIPE, CalledProcessError
12
13
14class GitArchiver(object):
15    """
16    GitArchiver
17
18    Scan a git repository and export all tracked files, and submodules.
19    Checks for .gitattributes files in each directory and uses 'export-ignore'
20    pattern entries for ignore files in the archive.
21
22    Automatically detects output format extension: zip, tar, bz2, or gz.
23    """
24
25    def __init__(self, prefix='', verbose=False, exclude=True, force_sub=False, extra=None, main_repo_abspath=None):
26        """
27        @type prefix:   string
28        @param prefix:  Prefix used to prepend all paths in the resulting archive.
29
30        @type verbose:  bool
31        @param verbose: Determines verbosity of the output (stdout).
32
33        @type exclude:  bool
34        @param exclude: Determines whether archiver should follow rules specified in .gitattributes files.
35                        Defaults to True.
36
37        @type force_sub:    bool
38        @param force_sub:   Determines whether submodules are initialized and updated before archiving.
39                            Defaults to False
40
41        @type extra:    list
42        @param extra:   List of extra paths to include in the resulting archive.
43
44        @type main_repo_abspath:    string
45        @param main_repo_abspath:   Absolute path to the main repository (or one of subdirectories).
46                                    If None, current cwd is used.
47                                    If given path is path to a subdirectory (but not a submodule directory!)
48                                    it will be replaced with abspath to toplevel directory of the repository.
49        """
50        if extra is None:
51            extra = []
52
53        if main_repo_abspath is None:
54            main_repo_abspath = path.abspath('')
55        elif not path.isabs(main_repo_abspath):
56            raise ValueError("You MUST pass absolute path to the main git repository.")
57
58        # Raises an exception if there is no repo under main_repo_abspath.
59        try:
60            self.run_shell("[ -d .git ] || git rev-parse --git-dir > /dev/null 2>&1", main_repo_abspath)
61        except Exception as e:
62            raise ValueError("Not a git repository (or any of the parent directories).".format(path=main_repo_abspath))
63
64        # Detect toplevel directory of the repo.
65        main_repo_abspath = path.abspath(self.read_git_shell('git rev-parse --show-toplevel', main_repo_abspath).rstrip())
66
67        self.prefix = prefix
68        self.verbose = verbose
69        self.exclude = exclude
70        self.extra = extra
71        self.force_sub = force_sub
72        self.main_repo_abspath = main_repo_abspath
73
74    def create(self, output_path, dry_run=False, output_format=None):
75        """
76        Creates the archive, written to the given output_file_path
77
78        Type of the archive is determined either by extension of output_file_path or by the format argument.
79        Supported formats are: gz, zip, bz2, tar, tgz
80
81        @type output_path:     string
82        @param output_path:    Output file path.
83
84        @type dry_run:  bool
85        @param dry_run: Determines whether create should do nothing but print what it would archive.
86
87        @type output_format:    string
88        @param output_format:   Determines format of the output archive.
89                                If None, format is determined from extension of output_file_path.
90        """
91        if output_format is None:
92            file_name, file_ext = path.splitext(output_path)
93            output_format = file_ext[len(extsep):].lower()
94
95        if output_format == 'zip':
96            from zipfile import ZipFile, ZIP_DEFLATED
97
98            if not dry_run:
99                archive = ZipFile(path.abspath(output_path), 'w')
100                add = lambda file_path, file_name: archive.write(file_path, path.join(self.prefix, file_name), ZIP_DEFLATED)
101        elif output_format in ['tar', 'bz2', 'gz', 'tgz']:
102            import tarfile
103
104            if output_format == 'tar':
105                t_mode = 'w'
106            elif output_format == 'tgz':
107                t_mode = 'w:gz'
108            else:
109                t_mode = 'w:{f}'.format(f=output_format)
110
111            if not dry_run:
112                archive = tarfile.open(path.abspath(output_path), t_mode)
113                add = lambda file_path, file_name: archive.add(file_path, path.join(self.prefix, file_name))
114        else:
115            raise RuntimeError("Unknown format: {f}".format(f=output_format))
116
117        for file_path in self.extra:
118            if not dry_run:
119                if self.verbose:
120                    print("Compressing {f} => {a}...".format(f=file_path,
121                                                             a=path.join(self.prefix, file_path)))
122                add(file_path, file_path)
123            else:
124                print("{f} => {a}".format(f=file_path,
125                                          a=path.join(self.prefix, file_path)))
126
127        for file_path in self.list_files():
128            if not dry_run:
129                if self.verbose:
130                    print("Compressing {f} => {a}...".format(f=path.join(self.main_repo_abspath, file_path),
131                                                             a=path.join(self.prefix, file_path)))
132                add(path.join(self.main_repo_abspath, file_path), file_path)
133            else:
134                print("{f} => {a}".format(f=path.join(self.main_repo_abspath, file_path),
135                                          a=path.join(self.prefix, file_path)))
136
137        if not dry_run:
138            archive.close()
139
140    def get_path_components(self, repo_abspath, abspath):
141        """
142        Splits given abspath into components until repo_abspath is reached.
143
144        E.g. if repo_abspath is '/Documents/Hobby/ParaView/' and abspath is
145        '/Documents/Hobby/ParaView/Catalyst/Editions/Base/', function will return:
146        ['.', 'Catalyst', 'Editions', 'Base']
147
148        First element is always '.' (concrete symbol depends on OS).
149
150        @type repo_abspath:     string
151        @param repo_abspath:    Absolute path to the git repository.
152
153        @type abspath:  string
154        @param abspath: Absolute path to within repo_abspath.
155
156        @rtype:     list
157        @return:    List of path components.
158        """
159        components = []
160
161        while not path.samefile(abspath, repo_abspath):
162            abspath, tail = path.split(abspath)
163
164            if len(tail):
165                components.insert(0, tail)
166
167        components.insert(0, path.relpath(repo_abspath, repo_abspath))
168        return components
169
170    def get_exclude_patterns(self, repo_abspath, repo_file_paths):
171        """
172        Returns exclude patterns for a given repo. It looks for .gitattributes files in repo_file_paths.
173
174        Resulting dictionary will contain exclude patterns per path (relative to the repo_abspath).
175        E.g. {('.', 'Catalyst', 'Editions', 'Base'), ['Foo*', '*Bar']}
176
177        @type repo_abspath:     string
178        @param repo_abspath:    Absolute path to the git repository.
179
180        @type repo_file_paths:  list
181        @param repo_file_paths: List of paths relative to the repo_abspath that are under git control.
182
183        @rtype:         dict
184        @return:    Dictionary representing exclude patterns.
185                    Keys are tuples of strings. Values are lists of strings.
186                    Returns None if self.exclude is not set.
187        """
188        if not self.exclude:
189            return None
190
191        def read_attributes(attributes_abspath):
192            patterns = []
193            if path.isfile(attributes_abspath):
194                attributes = open(attributes_abspath, 'r').readlines()
195                patterns = []
196                for line in attributes:
197                    tokens = line.strip().split()
198                    if "export-ignore" in tokens[1:]:
199                        patterns.append(tokens[0])
200            return patterns
201
202        exclude_patterns = {(): []}
203
204        # There may be no gitattributes.
205        try:
206            global_attributes_abspath = self.read_shell("git config --get core.attributesfile", repo_abspath).rstrip()
207            exclude_patterns[()] = read_attributes(global_attributes_abspath)
208        except Exception:
209            # And valid to not have them.
210            pass
211
212        for attributes_abspath in [path.join(repo_abspath, f) for f in repo_file_paths if f.endswith(".gitattributes")]:
213            # Each .gitattributes affects only files within its directory.
214            key = tuple(self.get_path_components(repo_abspath, path.dirname(attributes_abspath)))
215            exclude_patterns[key] = read_attributes(attributes_abspath)
216
217        local_attributes_abspath = path.join(repo_abspath, ".git", "info", "attributes")
218        key = tuple(self.get_path_components(repo_abspath, repo_abspath))
219
220        if key in exclude_patterns:
221            exclude_patterns[key].extend(read_attributes(local_attributes_abspath))
222        else:
223            exclude_patterns[key] = read_attributes(local_attributes_abspath)
224
225        return exclude_patterns
226
227    def is_file_excluded(self, repo_abspath, repo_file_path, exclude_patterns):
228        """
229        Checks whether file at a given path is excluded.
230
231        @type repo_abspath: string
232        @param repo_abspath: Absolute path to the git repository.
233
234        @type repo_file_path:   string
235        @param repo_file_path:  Path to a file within repo_abspath.
236
237        @type exclude_patterns:     dict
238        @param exclude_patterns:    Exclude patterns with format specified for get_exclude_patterns.
239
240        @rtype: bool
241        @return: True if file should be excluded. Otherwise False.
242        """
243        if exclude_patterns is None or not len(exclude_patterns):
244            return False
245
246        from fnmatch import fnmatch
247
248        file_name = path.basename(repo_file_path)
249        components = self.get_path_components(repo_abspath, path.join(repo_abspath, path.dirname(repo_file_path)))
250
251        is_excluded = False
252        # We should check all patterns specified in intermediate directories to the given file.
253        # At the end we should also check for the global patterns (key '()' or empty tuple).
254        while not is_excluded:
255            key = tuple(components)
256            if key in exclude_patterns:
257                patterns = exclude_patterns[key]
258                for p in patterns:
259                    if fnmatch(file_name, p) or fnmatch(repo_file_path, p):
260                        if self.verbose:
261                            print("Exclude pattern matched {pattern}: {path}".format(pattern=p, path=repo_file_path))
262                        is_excluded = True
263
264            if not len(components):
265                break
266
267            components.pop()
268
269        return is_excluded
270
271    def list_files(self, repo_path=''):
272        """
273        An iterator method that yields a file path relative to main_repo_abspath
274        for each file that should be included in the archive.
275        Skips those that match the exclusion patterns found in
276        any discovered .gitattributes files along the way.
277
278        Recurs into submodules as well.
279
280        @type repo_path:    string
281        @param repo_path:   Path to the git submodule repository within the main git repository.
282
283        @rtype:     iterator
284        @return:    Iterator to traverse files under git control relative to main_repo_abspath.
285        """
286        repo_abspath = path.join(self.main_repo_abspath, repo_path)
287        repo_file_paths = self.read_git_shell("git ls-files --cached --full-name --no-empty-directory", repo_abspath).splitlines()
288        exclude_patterns = self.get_exclude_patterns(repo_abspath, repo_file_paths)
289
290        for repo_file_path in repo_file_paths:
291            # Git puts path in quotes if file path has unicode characters.
292            repo_file_path = repo_file_path.strip('"')  # file path relative to current repo
293            file_name = path.basename(repo_file_path)
294
295            # Only list symlinks and files that don't start with git.
296            if file_name.startswith(".git") or (not path.islink(repo_file_path) and path.isdir(repo_file_path)):
297                continue
298
299            main_repo_file_path = path.join(repo_path, repo_file_path)  # file path relative to the main repo
300
301            if self.is_file_excluded(repo_abspath, repo_file_path, exclude_patterns):
302                continue
303
304            # Yield both repo_file_path and main_repo_file_path to preserve structure of the repo.
305            yield main_repo_file_path
306
307        if self.force_sub:
308            self.run_shell("git submodule init", repo_abspath)
309            self.run_shell("git submodule update", repo_abspath)
310
311        # List files of every submodule.
312        for submodule_path in self.read_shell("git submodule --quiet foreach 'pwd'", repo_abspath).splitlines():
313            # In order to get output path we need to exclude repository path from submodule_path.
314            submodule_path = path.relpath(submodule_path, self.main_repo_abspath)
315            for file_path in self.list_files(submodule_path):
316                yield file_path
317
318    @staticmethod
319    def run_shell(cmd, cwd=None):
320        """
321        Runs shell command.
322
323        @type cmd:  string
324        @param cmd: Command to be executed.
325
326        @type cwd:  string
327        @param cwd: Working directory.
328
329        @rtype:     int
330        @return:    Return code of the command.
331
332        @raise CalledProcessError:  Raises exception if return code of the command is non-zero.
333        """
334        p = Popen(cmd, shell=True, cwd=cwd)
335        p.wait()
336
337        if p.returncode:
338            raise CalledProcessError(returncode=p.returncode, cmd=cmd)
339
340        return p.returncode
341
342    @staticmethod
343    def read_shell(cmd, cwd=None, encoding='utf-8'):
344        """
345        Runs shell command and reads output.
346
347        @type cmd:  string
348        @param cmd: Command to be executed.
349
350        @type cwd:  string
351        @param cwd: Working directory.
352
353        @type encoding: string
354        @param encoding: Encoding used to decode bytes returned by Popen into string.
355
356        @rtype:     string
357        @return:    Output of the command.
358
359        @raise CalledProcessError:  Raises exception if return code of the command is non-zero.
360        """
361        p = Popen(cmd, shell=True, stdout=PIPE, cwd=cwd)
362        output, _ = p.communicate()
363        output = output.decode(encoding)
364
365        if p.returncode:
366            raise CalledProcessError(returncode=p.returncode, cmd=cmd, output=output)
367
368        return output
369
370    @staticmethod
371    def read_git_shell(cmd, cwd=None):
372        """
373        Runs git shell command, reads output and decodes it into unicode string
374
375        @type cmd:  string
376        @param cmd: Command to be executed.
377
378        @type cwd:  string
379        @param cwd: Working directory.
380
381        @rtype:     string
382        @return:    Output of the command.
383
384        @raise CalledProcessError:  Raises exception if return code of the command is non-zero.
385        """
386        p = Popen(cmd, shell=True, stdout=PIPE, cwd=cwd)
387        output, _ = p.communicate()
388        output = output.decode('unicode_escape').encode('raw_unicode_escape').decode('utf-8')
389
390        if p.returncode:
391            raise CalledProcessError(returncode=p.returncode, cmd=cmd, output=output)
392
393        return output
394
395
if __name__ == '__main__':
    # Command-line entry point: parse options, derive the archive prefix, then build the archive.
    from optparse import OptionParser

    parser = OptionParser(usage="usage: %prog [-v] [--prefix PREFIX] [--no-exclude] [--force-submodules] [--dry-run] OUTPUT_FILE",
                          version="%prog {version}".format(version=__version__))

    parser.add_option('--prefix',
                      type='string',
                      dest='prefix',
                      default='',
                      help="Prepend PREFIX to each filename in the archive. OUTPUT_FILE name is used by default to avoid tarbomb.")

    parser.add_option('-v', '--verbose',
                      action='store_true',
                      dest='verbose',
                      help='Enable verbose mode.')

    parser.add_option('--no-exclude',
                      action='store_false',
                      dest='exclude',
                      default=True,
                      help="Don't read .gitattributes files for patterns containing export-ignore attrib.")

    parser.add_option('--force-submodules',
                      action='store_true',
                      dest='force_sub',
                      help="Force a git submodule init && git submodule update at each level before iterating submodules.")

    parser.add_option('--extra',
                      action='append',
                      dest='extra',
                      default=[],
                      help="Any additional files to include in the archive.")

    parser.add_option('--dry-run',
                      action='store_true',
                      dest='dry_run',
                      help="Don't actually archive anything, just show what would be done.")

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error("You must specify exactly one output file")

    output_file_path = args[0]

    if path.isdir(output_file_path):
        parser.error("You cannot use directory as output")

    # avoid tarbomb
    if options.prefix:
        # Joining with '' appends a trailing separator if one is missing.
        options.prefix = path.join(options.prefix, '')
    else:
        import re

        # Default prefix is the output file name without its archive extension.
        # Raw string: the pattern contains backslash escapes meant for the regex engine.
        output_name = path.basename(output_file_path)
        output_name = re.sub(r'(\.zip|\.tar|\.tgz|\.gz|\.bz2|\.tar\.gz|\.tar\.bz2)$', '', output_name) or "Archive"
        options.prefix = path.join(output_name, '')

    try:
        archiver = GitArchiver(options.prefix,
                               options.verbose,
                               options.exclude,
                               options.force_sub,
                               options.extra)
        archiver.create(output_file_path, options.dry_run)
    except Exception as e:
        # Report any failure as a one-line message and exit with status 2.
        parser.exit(2, "{exception}\n".format(exception=e))

    sys.exit(0)
465