1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2012-2021 Edgewall Software
4# Copyright (C) 2006-2011, Herbert Valerio Riedel <hvr@gnu.org>
5# All rights reserved.
6#
7# This software is licensed as described in the file COPYING, which
8# you should have received as part of this distribution. The terms
9# are also available at https://trac.edgewall.org/wiki/TracLicense.
10#
11# This software consists of voluntary contributions made by many
12# individuals. For the exact contribution history, see the revision
13# history and logs, available at https://trac.edgewall.org/log/.
14
15import codecs
16import contextlib
17import io
18import os
19import re
20import subprocess
21import tempfile
22import weakref
23from collections import deque
24from functools import partial
25from subprocess import DEVNULL, PIPE
26from threading import Lock
27
28from trac.core import TracBaseError
29from trac.util import terminate
30from trac.util.compat import close_fds
31from trac.util.datefmt import time_now
32from trac.util.text import exception_to_unicode, to_unicode
33
34__all__ = ['GitError', 'GitErrorSha', 'Storage', 'StorageFactory']
35
36
class GitError(TracBaseError):
    """Base class of the errors raised by this module."""
    pass
39
class GitErrorSha(GitError):
    """Raised when a revision or object id is unknown or its content
    is empty.
    """
    pass
42
43# Helper functions
44
def parse_commit(raw):
    """Parse the raw content of a commit (as given by `git cat-file -p <rev>`).

    The header section consists of `<key> <value>` lines.  A line starting
    with a space continues the value of the header line right before it
    (multi-line values, e.g. `gpgsig` or `mergetag`).  The first empty line
    ends the header section, everything after it is the commit message.

    :param raw: the decoded text of the commit object
    :return: a `(message, props)` tuple; `props` maps each header key to
             the list of its values, in order of appearance
    :raises GitErrorSha: if `raw` is empty, or if a header line is
                         malformed (continuation line without a preceding
                         header, or a line without a key)
    """
    if not raw:
        raise GitErrorSha
    lines = raw.splitlines()
    if not lines:
        raise GitErrorSha

    props = {}
    # value list of the most recent header line; continuation lines are
    # appended to its last item, so repeated multi-line headers stay apart
    values = None
    # without an empty separator line there is no message at all
    body_start = len(lines)
    for idx, line in enumerate(lines):
        if not line:
            body_start = idx + 1
            break
        if line[0] == ' ':
            if values is None:
                raise GitErrorSha
            values[-1] += '\n' + line[1:]
            continue
        fields = line.split(None, 1)
        if not fields:
            raise GitErrorSha
        # a header may consist of a key only
        value = fields[1].strip() if len(fields) > 1 else ''
        values = props.setdefault(fields[0], [])
        values.append(value)
    return '\n'.join(lines[body_start:]), props
72
73
# Matches one backslash escape: either a single escape character or a
# three-digit octal byte value.
_unquote_re = re.compile(r'\\(?:[abtnvfr"\\]|[0-7]{3})'.encode('utf-8'))


def _make_unquote_table():
    """Build the table mapping an escape character to the byte it denotes."""
    table = bytearray(128)
    for escape, byte in zip(b'abtnvfr"\\', b'\a\b\t\n\v\f\r"\\'):
        table[escape] = byte
    return bytes(table)


_unquote_chars = _make_unquote_table()
del _make_unquote_table


def _unquote(path):
    """Strip the enclosing double quotes from `path` (bytes) and expand
    the backslash escapes inside; a path which is not enclosed in double
    quotes is returned unchanged.
    """
    if not (path.startswith(b'"') and path.endswith(b'"')):
        return path

    def expand(match):
        escape = match.group(0)
        # 4 bytes: backslash + 3 octal digits, otherwise backslash + char
        code = int(escape[1:], 8) if len(escape) == 4 \
               else _unquote_chars[escape[1]]
        return b'%c' % code

    return _unquote_re.sub(expand, path[1:-1])
93
94
def _rev_u(rev):
    """Convert a revision id from bytes to `str`; `None` is passed through."""
    if rev is None:
        return None
    return str(rev, 'ascii')
99
100
def _rev_b(rev):
    """Convert a revision id from `str` to bytes; `None` is passed through."""
    if rev is None:
        return None
    return rev.encode('ascii')
105
106
class GitCore(object):
    """Low-level wrapper around git executable

    Besides the explicitly defined methods, every public attribute is
    turned into a git sub-command on the fly: `core.rev_list('--all')`
    runs `git rev-list --all` and returns what git wrote to stdout
    (bytes).
    """

    def __init__(self, git_dir=None, git_bin='git', log=None,
                 fs_encoding=None):
        """
        `git_dir`: passed to git as `--git-dir=...` unless empty

        `git_bin`: the git executable to run

        `log`: optional logger, used to report failing git calls

        `fs_encoding`: encoding used to convert the command line
                arguments; if `None`, the arguments are passed as-is
        """
        self.__git_bin = git_bin
        self.__git_dir = git_dir
        self.__log = log
        self.__fs_encoding = fs_encoding

    def __repr__(self):
        return '<GitCore bin="%s" dir="%s">' % (self.__git_bin,
                                                self.__git_dir)

    def __build_git_cmd(self, gitcmd, *args):
        """construct command list for git call suitable for Popen()"""

        cmd = [self.__git_bin]
        if self.__git_dir:
            cmd.append('--git-dir=%s' % self.__git_dir)
        cmd.append(gitcmd)
        cmd.extend(args)

        fs_encoding = self.__fs_encoding
        if fs_encoding is not None:
            if os.name == 'nt':
                # If Python 3 for Windows, Popen() accepts only str instances
                def to_cmd_encoding(arg):
                    if isinstance(arg, bytes):
                        arg = arg.decode(fs_encoding, 'replace')
                    return arg
            else:
                def to_cmd_encoding(arg):
                    if isinstance(arg, str):
                        arg = arg.encode(fs_encoding, 'replace')
                    return arg
            cmd = list(map(to_cmd_encoding, cmd))
        return cmd

    def __pipe(self, git_cmd, *cmd_args, **kw):
        """start git and return the Popen object; stdin, stdout and stderr
        are pipes unless overridden by keyword arguments
        """
        kw.setdefault('stdin', PIPE)
        kw.setdefault('stdout', PIPE)
        kw.setdefault('stderr', PIPE)
        return subprocess.Popen(self.__build_git_cmd(git_cmd, *cmd_args),
                                close_fds=close_fds, **kw)

    def __execute(self, *args):
        """execute git command and return the content of its stdout

        A non-zero exit code or output on stderr is only logged (at debug
        level), it does not raise an error.
        """
        with self.__pipe(*args, stdin=DEVNULL) as p:
            stdout_data, stderr_data = p.communicate()
        if self.__log and (p.returncode != 0 or stderr_data):
            self.__log.debug('%s exits with %d, dir: %r, args: %r, stderr: %r',
                             self.__git_bin, p.returncode, self.__git_dir,
                             args, stderr_data)

        return stdout_data

    def cat_file_batch(self):
        """start `git cat-file --batch` and return the Popen object"""
        return self.__pipe('cat-file', '--batch')

    def log_pipe(self, *cmd_args):
        """start `git log` with the given arguments and return the Popen
        object
        """
        return self.__pipe('log', *cmd_args)

    def diff_tree_pipe(self):
        """start `git diff-tree` reading from stdin and return the Popen
        object
        """
        return self.__pipe('diff-tree', '--stdin', '--root', '-z', '-r', '-M')

    def __getattr__(self, name):
        # Only invoked for attributes which are not found the regular way:
        # map `foo_bar` to a callable running `git foo-bar`.  Private names
        # and the pipe methods are never treated as git commands.
        if name.startswith('_') or \
                name in ('cat_file_batch', 'log_pipe', 'diff_tree_pipe'):
            raise AttributeError(name)
        return partial(self.__execute, name.replace('_','-'))

    __is_sha_pat = re.compile(b'[0-9A-Fa-f]{4,40}')

    @classmethod
    def is_sha(cls, sha):
        """returns whether sha is a potential sha id
        (i.e. proper hexstring between 4 and 40 characters)

        `sha` must be a bytes object.
        """

        # quick test before starting up regexp matcher
        if not (4 <= len(sha) <= 40):
            return False

        # fullmatch() instead of match() with a trailing '$': the latter
        # also accepts a hexstring followed by a newline
        return bool(cls.__is_sha_pat.fullmatch(sha))
195
196
class SizedDict(dict):
    """Dictionary which holds at most `max_size` items.

    When the limit is exceeded, the keys which were inserted first are
    dropped first (FIFO).  Assigning to an existing key replaces the value
    but does not change the position of the key in the queue.
    """

    def __init__(self, max_size=0):
        super().__init__()
        self.__max_size = max_size
        self.__key_fifo = deque()
        self.__lock = Lock()

    def __setitem__(self, name, value):
        with self.__lock:
            fifo = self.__key_fifo
            assert len(self) == len(fifo) # invariant

            if name not in self:
                fifo.append(name)

            rc = super().__setitem__(name, value)

            # evict the oldest keys until the size limit is met again
            while len(fifo) > self.__max_size:
                del self[fifo.popleft()]

            assert len(self) == len(fifo) # invariant

            return rc

    def setdefault(self, *_):
        # would bypass the bookkeeping done in __setitem__
        raise NotImplementedError("SizedDict has no setdefault() method")
224
225
class StorageFactory(object):
    """Hands out one shared `Storage` instance per repository.

    The instances are kept in a weak-value dictionary; a request with
    `weak=False` additionally stores a strong reference, which is dropped
    again by the next request with `weak=True` for the same repository.
    """

    __dict = weakref.WeakValueDictionary()
    __dict_nonweak = {}
    __dict_rev_cache = {}
    __dict_lock = Lock()

    def __init__(self, repo, log, weak=True, git_bin='git',
                 git_fs_encoding=None):
        self.logger = log

        with self.__dict_lock:
            if weak:
                # drop the additional reference which was created by a
                # request with non-weak argument
                self.__dict_nonweak.pop(repo, None)

            storage = self.__dict.get(repo)
            if storage is None:
                storage = Storage(repo, log, git_bin, git_fs_encoding,
                                  self.__dict_rev_cache.get(repo))
                self.__dict[repo] = storage

            if not weak:
                # keep the instance alive independently of the requester
                self.__dict_nonweak[repo] = storage

        self.__inst = storage
        self.logger.debug("requested %s PyGIT.Storage instance for '%s'",
                          'weak' if weak else 'non-weak', repo)

    def getInstance(self):
        """Return the `Storage` instance selected by the constructor."""
        return self.__inst

    @classmethod
    def set_rev_cache(cls, repo, rev_cache):
        """Remember `rev_cache` for `repo`; it is handed to the next
        `Storage` instance which gets created for that repository.
        """
        with cls.__dict_lock:
            cls.__dict_rev_cache[repo] = rev_cache

    @classmethod
    def _clean(cls):
        """For testing purpose only"""
        with cls.__dict_lock:
            for registry in (cls.__dict, cls.__dict_nonweak,
                             cls.__dict_rev_cache):
                registry.clear()
274
275
276class Storage(object):
277    """High-level wrapper around GitCore with in-memory caching"""
278
279    __SREV_MIN = 4 # minimum short-rev length
280
281    class RevCache(object):
282
283        __slots__ = ('youngest_rev', 'oldest_rev', 'rev_dict', 'refs_dict',
284                     'srev_dict')
285
286        def __init__(self, youngest_rev, oldest_rev, rev_dict, refs_dict,
287                     srev_dict):
288            self.youngest_rev = youngest_rev
289            self.oldest_rev = oldest_rev
290            self.rev_dict = rev_dict
291            self.refs_dict = refs_dict
292            self.srev_dict = srev_dict
293            if youngest_rev is not None and oldest_rev is not None and \
294                    rev_dict and refs_dict and srev_dict:
295                pass  # all fields are not empty
296            elif not youngest_rev and not oldest_rev and \
297                    not rev_dict and not refs_dict and not srev_dict:
298                pass  # all fields are empty
299            else:
300                raise ValueError('Invalid RevCache fields: %r' % self)
301
302        @classmethod
303        def empty(cls):
304            return cls(None, None, {}, {}, {})
305
306        def __repr__(self):
307            return 'RevCache(youngest_rev=%r, oldest_rev=%r, ' \
308                   'rev_dict=%d entries, refs_dict=%d entries, ' \
309                   'srev_dict=%d entries)' % \
310                   (self.youngest_rev, self.oldest_rev, len(self.rev_dict),
311                    len(self.refs_dict), len(self.srev_dict))
312
313        def iter_branches(self):
314            head = self.refs_dict.get(b'HEAD')
315            for refname, rev in self.refs_dict.items():
316                if refname.startswith(b'refs/heads/'):
317                    yield refname[11:], rev, refname == head
318
319        def iter_tags(self):
320            for refname, rev in self.refs_dict.items():
321                if refname.startswith(b'refs/tags/'):
322                    yield refname[10:], rev
323
324    @staticmethod
325    def __rev_key(rev):
326        assert len(rev) >= 4
327        #assert GitCore.is_sha(rev)
328        srev_key = int(rev[:4], 16)
329        assert 0 <= srev_key <= 0xffff
330        return srev_key
331
332    @staticmethod
333    def git_version(git_bin='git'):
334        GIT_VERSION_MIN_REQUIRED = (1, 5, 6)
335        try:
336            g = GitCore(git_bin=git_bin)
337            [v] = g.version().splitlines()
338            version = v.strip().split()[2]
339            # 'version' has usually at least 3 numeric version
340            # components, e.g.::
341            #  1.5.4.2
342            #  1.5.4.3.230.g2db511
343            #  1.5.4.GIT
344
345            def try_int(s):
346                try:
347                    return int(s)
348                except ValueError:
349                    return s
350
351            split_version = tuple(map(try_int, version.split(b'.')))
352
353            result = {}
354            result['v_str'] = version
355            result['v_tuple'] = split_version
356            result['v_min_tuple'] = GIT_VERSION_MIN_REQUIRED
357            result['v_min_str'] = ".".join(map(str, GIT_VERSION_MIN_REQUIRED))
358            result['v_compatible'] = split_version >= GIT_VERSION_MIN_REQUIRED
359            return result
360
361        except Exception as e:
362            raise GitError("Could not retrieve GIT version (tried to "
363                           "execute/parse '%s --version' but got %s)"
364                           % (git_bin, repr(e)))
365
366    def __init__(self, git_dir, log, git_bin='git', git_fs_encoding=None,
367                 rev_cache=None):
368        """Initialize PyGit.Storage instance
369
370        `git_dir`: path to .git folder;
371                this setting is not affected by the `git_fs_encoding` setting
372
373        `log`: logger instance
374
375        `git_bin`: path to executable
376                this setting is not affected by the `git_fs_encoding` setting
377
378        `git_fs_encoding`: encoding used for paths stored in git repository;
379                if `None`, no implicit decoding/encoding to/from
380                unicode objects is performed, and bytestrings are
381                returned instead
382        """
383
384        self.logger = log
385
386        self.commit_encoding = None
387
388        # caches
389        self.__rev_cache = rev_cache or self.RevCache.empty()
390        self.__rev_cache_refresh = True
391        self.__rev_cache_lock = Lock()
392
393        # cache the last 200 commit messages
394        self.__commit_msg_cache = SizedDict(200)
395        self.__commit_msg_lock = Lock()
396
397        self.__cat_file_pipe = None
398        self.__cat_file_pipe_lock = Lock()
399        self.__diff_tree_pipe = None
400        self.__diff_tree_pipe_lock = Lock()
401
402        if git_fs_encoding is not None:
403            # validate encoding name
404            codecs.lookup(git_fs_encoding)
405
406            # setup conversion functions
407            self._fs_to_unicode = lambda s: s.decode(git_fs_encoding,
408                                                     'replace')
409            self._fs_from_unicode = lambda s: s.encode(git_fs_encoding)
410        else:
411            # pass bytestrings as-is w/o any conversion
412            self._fs_to_unicode = self._fs_from_unicode = lambda s: s
413
414        # simple sanity checking
415        try:
416            os.listdir(git_dir)
417        except EnvironmentError as e:
418            self._raise_not_readable(git_dir, e)
419        if not self._control_files_exist(git_dir):
420            dot_git_dir = os.path.join(git_dir, '.git')
421            try:
422                os.listdir(dot_git_dir)
423            except EnvironmentError:
424                missing = True
425            else:
426                if self._control_files_exist(dot_git_dir):
427                    missing = False
428                    git_dir = dot_git_dir
429                else:
430                    missing = True
431            if missing:
432                raise GitError("Git control files not found in '%s'" % git_dir)
433
434        # at least, check that the HEAD file is readable
435        try:
436            with open(os.path.join(git_dir, 'HEAD'), 'rb'):
437                pass
438        except EnvironmentError as e:
439            self._raise_not_readable(git_dir, e)
440
441        self.repo = GitCore(git_dir, git_bin, log, git_fs_encoding)
442        self.repo_path = git_dir
443
444        self.logger.debug("PyGIT.Storage instance for '%s' is constructed",
445                          git_dir)
446
447    def _cleanup_proc(self, proc):
448        if proc:
449            for f in (proc.stdin, proc.stdout, proc.stderr):
450                if f:
451                    f.close()
452            terminate(proc)
453            proc.wait()
454
    def __del__(self):
        """Shut down the git processes which are still attached to this
        instance.
        """
        # each pipe is cleaned up while holding its own lock
        with self.__cat_file_pipe_lock:
            self._cleanup_proc(self.__cat_file_pipe)
        with self.__diff_tree_pipe_lock:
            self._cleanup_proc(self.__diff_tree_pipe)
460
461    #
462    # cache handling
463    #
464
465    def invalidate_rev_cache(self):
466        with self.__rev_cache_lock:
467            self.__rev_cache_refresh = True
468
469    @property
470    def rev_cache(self):
471        """Retrieve revision cache
472
473        may rebuild cache on the fly if required
474
475        returns RevCache tuple
476        """
477        with self.__rev_cache_lock:
478            self._refresh_rev_cache()
479            return self.__rev_cache
480
481    def _refresh_rev_cache(self, force=False):
482        refreshed = False
483        if force or self.__rev_cache_refresh:
484            self.__rev_cache_refresh = False
485            refs = self._get_refs()
486            if self.__rev_cache.refs_dict != refs:
487                self.logger.debug("Detected changes in git repository "
488                                  "'%s'", self.repo_path)
489                rev_cache = self._build_rev_cache(refs)
490                self.__rev_cache = rev_cache
491                StorageFactory.set_rev_cache(self.repo_path, rev_cache)
492                refreshed = True
493            else:
494                self.logger.debug("Detected no changes in git repository "
495                                  "'%s'", self.repo_path)
496        return refreshed
497
498    def _build_rev_cache(self, refs):
499        self.logger.debug("triggered rebuild of commit tree db for '%s'",
500                          self.repo_path)
501        ts0 = time_now()
502
503        new_db = {} # db
504        new_sdb = {} # short_rev db
505
506        # helper for reusing strings
507        revs_seen = {}
508        def _rev_reuse(rev):
509            return revs_seen.setdefault(rev, rev)
510
511        refs = {refname: _rev_reuse(rev) for refname, rev in refs.items()}
512        head_revs = {rev for refname, rev in refs.items()
513                         if refname.startswith(b'refs/heads/')}
514        rev_list = [list(map(_rev_reuse, line.split()))
515                    for line in self.repo.rev_list('--parents', '--topo-order',
516                                                   '--all').splitlines()]
517        revs_seen = None
518
519        if rev_list:
520            # first rev seen is assumed to be the youngest one
521            youngest = rev_list[0][0]
522            # last rev seen is assumed to be the oldest one
523            oldest = rev_list[-1][0]
524        else:
525            youngest = oldest = None
526
527        rheads_seen = {}
528        def _rheads_reuse(rheads):
529            rheads = frozenset(rheads)
530            return rheads_seen.setdefault(rheads, rheads)
531
532        __rev_key = self.__rev_key
533        for ord_rev, revs in enumerate(rev_list):
534            rev = revs[0]
535            parents = revs[1:]
536
537            # shortrev "hash" map
538            new_sdb.setdefault(__rev_key(rev), []).append(rev)
539
540            # new_db[rev] = (children(rev), parents(rev),
541            #                ordinal_id(rev), rheads(rev))
542            if rev in new_db:
543                # (incomplete) entry was already created by children
544                _children, _parents, _ord_rev, _rheads = new_db[rev]
545                assert _children
546                assert not _parents
547                assert _ord_rev == 0
548            else: # new entry
549                _children = set()
550                _rheads = set()
551            if rev in head_revs:
552                _rheads.add(rev)
553
554            # create/update entry
555            # transform into frozenset and tuple since entry will be final
556            new_db[rev] = (frozenset(_children), tuple(parents), ord_rev + 1,
557                           _rheads_reuse(_rheads))
558
559            # update parents(rev)s
560            for parent in parents:
561                # by default, a dummy ordinal_id is used for the mean-time
562                _children, _parents, _ord_rev, _rheads2 = \
563                    new_db.setdefault(parent, (set(), [], 0, set()))
564
565                # update parent(rev)'s children
566                _children.add(rev)
567
568                # update parent(rev)'s rheads
569                _rheads2.update(_rheads)
570
571        rheads_seen = None
572
573        # convert sdb either to dict or array depending on size
574        tmp = [()] * (max(new_sdb) + 1) if len(new_sdb) > 5000 else {}
575        try:
576            while True:
577                k, v = new_sdb.popitem()
578                tmp[k] = tuple(v)
579        except KeyError:
580            pass
581        assert len(new_sdb) == 0
582        new_sdb = tmp
583
584        rev_cache = self.RevCache(youngest, oldest, new_db, refs, new_sdb)
585        self.logger.debug("rebuilt commit tree db for '%s' with %d entries "
586                          "(took %.1f ms)", self.repo_path, len(new_db),
587                          1000 * (time_now() - ts0))
588        return rev_cache
589
590    def _get_refs(self):
591        refs = {}
592        tags = {}
593
594        for line in self.repo.show_ref('--dereference').splitlines():
595            if b' ' not in line:
596                continue
597            rev, refname = line.split(b' ', 1)
598            if refname.endswith(b'^{}'):  # derefered tag
599                tags[refname[:-3]] = rev
600            else:
601                refs[refname] = rev
602        refs.update(iter(tags.items()))
603
604        if refs:
605            refname = (self.repo.symbolic_ref('-q', 'HEAD') or '').strip()
606            if refname in refs:
607                refs[b'HEAD'] = refname
608
609        return refs
610
611    def get_branches(self):
612        """returns list of (local) branches, with active (= HEAD) one being
613        the first item
614        """
615        def fn(args):
616            name, rev, head = args
617            return not head, name
618        _fs_to_unicode = self._fs_to_unicode
619        branches = sorted(((_fs_to_unicode(name), _rev_u(rev), head)
620                           for name, rev, head
621                           in self.rev_cache.iter_branches()), key=fn)
622        return [(name, rev) for name, rev, head in branches]
623
624    def get_refs(self):
625        _fs_to_unicode = self._fs_to_unicode
626        for refname, rev in self.rev_cache.refs_dict.items():
627            if refname != b'HEAD':
628                yield _fs_to_unicode(refname), _rev_u(rev)
629
630    def get_commits(self):
631        return self.rev_cache.rev_dict
632
633    def oldest_rev(self):
634        return _rev_u(self.rev_cache.oldest_rev)
635
636    def youngest_rev(self):
637        return _rev_u(self.rev_cache.youngest_rev)
638
639    def get_branch_contains(self, sha, resolve=False):
640        """return list of reachable head sha ids or (names, sha) pairs if
641        resolve is true
642
643        see also get_branches()
644        """
645
646        sha = _rev_b(sha)
647        _rev_cache = self.rev_cache
648
649        try:
650            rheads = _rev_cache.rev_dict[sha][3]
651        except KeyError:
652            return []
653
654        if resolve:
655            _fs_to_unicode = self._fs_to_unicode
656            rv = [(_fs_to_unicode(name), _rev_u(rev))
657                  for name, rev, head in _rev_cache.iter_branches()
658                  if rev in rheads]
659            rv.sort(key=lambda v: v[0])
660            return rv
661        else:
662            return list(map(_rev_u, rheads))
663
664    def history_relative_rev(self, sha, rel_pos):
665
666        def get_history_relative_rev(sha, rel_pos):
667            rev_dict = self.get_commits()
668
669            if sha not in rev_dict:
670                raise GitErrorSha()
671
672            if rel_pos == 0:
673                return sha
674
675            lin_rev = rev_dict[sha][2] + rel_pos
676
677            if lin_rev < 1 or lin_rev > len(rev_dict):
678                return None
679
680            for k, v in rev_dict.items():
681                if v[2] == lin_rev:
682                    return k
683
684            # should never be reached if rev_dict is consistent
685            raise GitError("internal inconsistency detected")
686
687        result = get_history_relative_rev(_rev_b(sha), rel_pos)
688        return _rev_u(result)
689
690    def hist_next_revision(self, sha):
691        return self.history_relative_rev(sha, -1)
692
693    def hist_prev_revision(self, sha):
694        return self.history_relative_rev(sha, +1)
695
696    def get_commit_encoding(self):
697        if self.commit_encoding is None:
698            self.commit_encoding = \
699                self.repo.config('--get', 'i18n.commitEncoding').strip() or \
700                'utf-8'
701
702        return self.commit_encoding
703
704    def head(self):
705        """get current HEAD commit id"""
706        return self.verifyrev('HEAD')
707
708    def cat_file(self, kind, sha):
709        return self._cat_file_reader(kind, sha).read()
710
711    def _cat_file_reader(self, kind, sha):
712        with self.__cat_file_pipe_lock:
713            if self.__cat_file_pipe is None:
714                self.__cat_file_pipe = self.repo.cat_file_batch()
715
716            try:
717                self.__cat_file_pipe.stdin.write(sha + b'\n')
718                self.__cat_file_pipe.stdin.flush()
719
720                split_stdout_line = self.__cat_file_pipe.stdout.readline() \
721                                                               .split()
722                if len(split_stdout_line) != 3:
723                    raise GitError("internal error (could not split line %s)" %
724                                   repr(split_stdout_line))
725
726                _sha, _type, _size = split_stdout_line
727
728                if _type != kind:
729                    raise GitError("internal error (got unexpected object "
730                                   "kind %r, expected %r)" % (_type, kind))
731
732                size = int(_size)
733
734                # stdout.read() can return fewer bytes than requested,
735                # especially if a pipe buffers because the contents are
736                # larger than 64k.
737                stdout_read = self.__cat_file_pipe.stdout.read
738                if size > 32 * 1024 * 1024:
739                    buf = tempfile.TemporaryFile()
740                else:
741                    buf = io.BytesIO()
742                remaining = size + 1
743                while remaining > 0:
744                    chunk = stdout_read(min(remaining, 65536))
745                    if not chunk:
746                        # No new data, let's abort
747                        raise GitError("internal error (expected to read %d "
748                                       "bytes, but only got %d)" %
749                                       (size + 1, size + 1 - remaining))
750                    remaining -= len(chunk)
751                    buf.write(chunk if remaining > 0 else chunk[:-1])
752
753                buf.seek(0)
754                return buf
755
756            except EnvironmentError as e:
757                # There was an error, we should close the pipe to get to a
758                # consistent state (Otherwise it happens that next time we
759                # call cat_file we get payload from previous call)
760                self.logger.warning("closing cat_file pipe: %s",
761                                    exception_to_unicode(e))
762                self._cleanup_proc(self.__cat_file_pipe)
763                self.__cat_file_pipe = None
764
765    def verifyrev(self, rev):
766        """verify/lookup given revision object and return a sha id or None
767        if lookup failed
768        """
769
770        def get_verifyrev(rev):
771            _rev_cache = self.rev_cache
772
773            if GitCore.is_sha(rev):
774                # maybe it's a short or full rev
775                fullrev = self.fullrev(rev)
776                if fullrev:
777                    return fullrev
778
779            refs = _rev_cache.refs_dict
780            if rev == b'HEAD':  # resolve HEAD
781                refname = refs.get(rev)
782                if refname in refs:
783                    return refs[refname]
784            resolved = refs.get(b'refs/heads/' + rev)  # resolve branch
785            if resolved:
786                return resolved
787            resolved = refs.get(b'refs/tags/' + rev)  # resolve tag
788            if resolved:
789                return resolved
790
791            # fall back to external git calls
792            rc = self.repo.rev_parse('--verify', rev).strip()
793            if not rc:
794                return None
795            if rc in _rev_cache.rev_dict:
796                return rc
797
798            return None
799
800        result = get_verifyrev(self._fs_from_unicode(rev))
801        return _rev_u(result)
802
803    def shortrev(self, rev, min_len=7):
804
805        def get_shortrev(rev, min_len):
806            """try to shorten sha id"""
807            #try to emulate the following:
808            #return self.repo.rev_parse("--short", rev).strip()
809
810            if min_len < self.__SREV_MIN:
811                min_len = self.__SREV_MIN
812
813            _rev_cache = self.rev_cache
814
815            if rev not in _rev_cache.rev_dict:
816                return None
817
818            srev = rev[:min_len]
819            srevs = set(_rev_cache.srev_dict[self.__rev_key(rev)])
820
821            if len(srevs) == 1:
822                return srev # we already got a unique id
823
824            # find a shortened id for which rev doesn't conflict with
825            # the other ones from srevs
826            crevs = srevs - {rev}
827
828            for l in range(min_len+1, 40):
829                srev = rev[:l]
830                if srev not in [ r[:l] for r in crevs ]:
831                    return srev
832
833            return rev # worst-case, all except the last character match
834
835        return _rev_u(get_shortrev(_rev_b(rev), min_len))
836
837
838    def fullrev(self, rev):
839        """try to reverse shortrev()"""
840
841        _rev_cache = self.rev_cache
842
843        # short-cut
844        if len(rev) == 40 and rev in _rev_cache.rev_dict:
845            return rev
846
847        if not GitCore.is_sha(rev):
848            return None
849
850        try:
851            srevs = _rev_cache.srev_dict[self.__rev_key(rev)]
852        except KeyError:
853            return None
854
855        resolved = None
856        for s in srevs:
857            if s.startswith(rev):
858                if resolved is not None:
859                    return None
860                resolved = s
861        return resolved
862
863    def get_tags(self, rev=None):
864        if rev is not None:
865            rev = _rev_b(rev)
866        return sorted(self._fs_to_unicode(name)
867                      for name, rev_ in self.rev_cache.iter_tags()
868                      if rev is None or rev == rev_)
869
870    def ls_tree(self, rev, path='', recursive=False):
871        rev = self._fs_from_unicode(rev) if rev else b'HEAD'  # paranoia
872        path = self._fs_from_unicode(path).lstrip(b'/') or b'.'
873        tree = self.repo.ls_tree('-zlr' if recursive else '-zl',
874                                 rev, '--', path).split(b'\0')
875
876        def split_ls_tree_line(l):
877            """split according to '<mode> <type> <sha> <size>\t<fname>'"""
878
879            meta, fname = l.split(b'\t', 1)
880            _mode, _type, _sha, _size = meta.split()
881            _type = str(_type, 'utf-8')
882            _sha = _rev_u(_sha)
883            _mode = int(_mode, 8)
884            _size = None if _size == b'-' else int(_size)
885            fname = self._fs_to_unicode(fname)
886            return _mode, _type, _sha, _size, fname
887
888        return [split_ls_tree_line(e) for e in tree if e]
889
890    def read_commit(self, commit_id):
891        if not commit_id:
892            raise GitError("read_commit called with empty commit_id")
893
894        commit_id_orig = commit_id
895        commit_id = self.fullrev(_rev_b(commit_id))
896
897        rev_dict = self.get_commits()
898        if commit_id not in rev_dict:
899            self.logger.info("read_commit failed for %r (%r)",
900                             commit_id, commit_id_orig)
901            raise GitErrorSha
902
903        with self.__commit_msg_lock:
904            if commit_id in self.__commit_msg_cache:
905                # cache hit
906                result = self.__commit_msg_cache[commit_id]
907                return result[0], dict(result[1])
908
909        # cache miss
910        raw = self.cat_file(b'commit', commit_id)
911        raw = str(raw, self.get_commit_encoding(), 'replace')
912        result = parse_commit(raw)
913        with self.__commit_msg_lock:
914            self.__commit_msg_cache[commit_id] = result
915        return result[0], dict(result[1])
916
917    def get_file(self, sha):
918        sha = _rev_b(sha)
919        return self._cat_file_reader(b'blob', sha)
920
921    def get_obj_size(self, sha):
922        sha = _rev_b(sha)
923        try:
924            obj_size = int(self.repo.cat_file(b'-s', sha).strip())
925        except ValueError:
926            raise GitErrorSha("object '%s' not found" % sha)
927        return obj_size
928
929    def children(self, sha):
930        sha = _rev_b(sha)
931        rev_dict = self.get_commits()
932        try:
933            item = rev_dict[sha]
934        except KeyError:
935            return ()
936        return sorted(map(_rev_u, item[0]))
937
938    def children_recursive(self, sha, rev_dict=None):
939        """Recursively traverse children in breadth-first order"""
940
941        if rev_dict is None:
942            rev_dict = self.get_commits()
943
944        work_list = deque()
945        seen = set()
946
947        _children = rev_dict[sha][0]
948        seen.update(_children)
949        work_list.extend(_children)
950
951        while work_list:
952            p = work_list.popleft()
953            yield p
954
955            _children = rev_dict[p][0] - seen
956            seen.update(_children)
957            work_list.extend(_children)
958
959        assert len(work_list) == 0
960
961    def parents(self, sha):
962        sha = _rev_b(sha)
963        rev_dict = self.get_commits()
964        try:
965            item = rev_dict[sha]
966        except KeyError:
967            return []
968        return list(map(_rev_u, item[1]))
969
970    def all_revs(self):
971        for rev in self.get_commits():
972            yield _rev_u(rev)
973
974    def sync(self):
975        with self.__rev_cache_lock:
976            return self._refresh_rev_cache(force=True)
977
    @contextlib.contextmanager
    def get_historian(self, sha, base_path):
        """Context manager providing a `historian(path)` lookup function.

        `historian(path)` returns the id of the first commit in the
        `git log --name-status` output for `sha` (restricted to
        `base_path`) which lists `path` itself or an entry below it.
        The log output is consumed lazily, only as far as needed to
        answer the lookups made so far. Once the output is exhausted,
        lookups of unknown paths yield `_rev_u(None)`.

        The log process, if still registered, is passed to
        `_cleanup_proc()` when the context is left.
        """
        # one-element list used as a mutable cell for the log process;
        # empty while no process is registered
        p = []
        # path (bytes) -> id of the first commit seen in the log output
        # which touches that path or something below it
        change = {}
        # one-element list holding the path `historian()` is waiting for
        next_path = []
        base_path = self._fs_from_unicode(base_path) or '.'

        def name_status_gen():
            # the log process is only started on the first `next()` call
            p[:] = [self.repo.log_pipe('--pretty=format:%n%H', '--no-renames',
                                       '--name-status', sha, '--', base_path)]
            f = p[0].stdout
            for l in f:
                # skip empty lines in front of a commit id
                if l == b'\n':
                    continue
                old_sha = l.rstrip(b'\n')
                # the inner loop consumes the same stream: it reads the
                # '<status>\t<path>' lines of this commit up to the next
                # empty line
                for l in f:
                    if l == b'\n':
                        break
                    _, path = l.rstrip(b'\n').split(b'\t', 1)
                    # git-log without -z option quotes each pathname
                    path = _unquote(path)
                    # record the commit for the path and for each of its
                    # parent directories which has not been seen yet
                    while path not in change:
                        change[path] = old_sha
                        # hand the commit over to the waiting historian();
                        # parsing resumes here on the next lookup miss
                        if next_path == [path]:
                            yield old_sha
                        try:
                            path, _ = path.rsplit(b'/', 1)
                        except ValueError:
                            # no '/' left, i.e. no further parent
                            break
            # log output exhausted: release the process and unregister it
            if p:
                self._cleanup_proc(p[0])
            p[:] = []
            # from now on every unknown path is answered with None
            while True:
                yield None
        gen = name_status_gen()

        def historian(path):
            path = self._fs_from_unicode(path)
            try:
                rev = change[path]
            except KeyError:
                # not seen yet: parse more log output until the generator
                # reports this path or runs out of data
                next_path[:] = [path]
                rev = next(gen)
            return _rev_u(rev)

        try:
            yield historian
        finally:
            # the process is still registered if the log output has not
            # been read to its end
            if p:
                self._cleanup_proc(p[0])
1028
1029    def last_change(self, sha, path, historian=None):
1030        if historian is not None:
1031            return historian(path)
1032        for entry in self.history(sha, path, limit=1):
1033            return entry
1034        return None
1035
1036    def history(self, sha, path, limit=None):
1037        if limit is None:
1038            limit = -1
1039
1040        args = ['--max-count=%d' % limit, str(sha)]
1041        if path:
1042            args.extend(('--', self._fs_from_unicode(path)))
1043        tmp = self.repo.rev_list(*args)
1044        for rev in tmp.splitlines():
1045            yield _rev_u(rev)
1046
1047    def history_timerange(self, start, stop):
1048        # retrieve start <= committer-time < stop,
1049        # see CachedRepository.get_changesets()
1050        output = self.repo.rev_list('--all', '--date-order',
1051                                    '--max-age=%d' % start,
1052                                    '--min-age=%d' % (stop - 1))
1053        return [_rev_u(rev) for rev in output.splitlines()]
1054
1055    def rev_is_anchestor_of(self, rev1, rev2):
1056        """return True if rev2 is successor of rev1"""
1057
1058        rev1 = _rev_b(rev1)
1059        rev2 = _rev_b(rev2)
1060        rev_dict = self.get_commits()
1061        return (rev2 in rev_dict and
1062                rev2 in self.children_recursive(rev1, rev_dict))
1063
1064    def blame(self, commit_sha, path):
1065        in_metadata = False
1066
1067        commit_sha = _rev_b(commit_sha)
1068        path = self._fs_from_unicode(path)
1069
1070        for line in self.repo.blame('-p', '--', path, commit_sha) \
1071                             .splitlines():
1072            assert line
1073            if in_metadata:
1074                in_metadata = not line.startswith(b'\t')
1075            else:
1076                split_line = line.split()
1077                if len(split_line) == 4:
1078                    (sha, orig_lineno, lineno, group_size) = split_line
1079                else:
1080                    (sha, orig_lineno, lineno) = split_line
1081
1082                assert len(sha) == 40
1083                yield _rev_u(sha), lineno
1084                in_metadata = True
1085
1086        assert not in_metadata
1087
1088    def get_changes(self, tree1, tree2):
1089        with self.__diff_tree_pipe_lock:
1090            if self.__diff_tree_pipe is None:
1091                self.__diff_tree_pipe = self.repo.diff_tree_pipe()
1092            proc = self.__diff_tree_pipe
1093            try:
1094                proc.stdin.write(b'%s %s\n\n' % (_rev_b(tree2), _rev_b(tree1))
1095                                 if tree1 else
1096                                 b'%s\n\n' % _rev_b(tree2))
1097                proc.stdin.flush()
1098                read = proc.stdout.read
1099                entries = []
1100                c = read(1)
1101                if not c:
1102                    raise EOFError()
1103                while c != b'\n':
1104                    entry = bytearray()
1105                    while c != b'\0':
1106                        entry.append(c[0])
1107                        c = read(1)
1108                        if not c:
1109                            raise EOFError()
1110                    entries.append(bytes(entry))
1111                    c = read(1)
1112                    if not c:
1113                        raise EOFError()
1114            except:
1115                self.__diff_tree_pipe = None
1116                self._cleanup_proc(proc)
1117                raise
1118        if not entries:
1119            return
1120        # skip first entry as a sha
1121        assert not entries[0].startswith(b':')
1122        entries = entries[1:]
1123
1124        yield from self._iter_diff_tree(entries)
1125
1126    def diff_tree(self, tree1, tree2, path='', find_renames=False):
1127        """calls `git diff-tree` and returns tuples of the kind
1128        (mode1,mode2,obj1,obj2,action,path1,path2)"""
1129
1130        # diff-tree returns records with the following structure:
1131        # :<old-mode> <new-mode> <old-sha> <new-sha> <change> NUL <old-path> NUL [ <new-path> NUL ]
1132
1133        path = self._fs_from_unicode(path).strip(b'/') or b'.'
1134        diff_tree_args = ['-z', '-r']
1135        if find_renames:
1136            diff_tree_args.append('-M')
1137        diff_tree_args.extend([tree1 if tree1 else '--root',
1138                               tree2, '--', path])
1139        result = self.repo.diff_tree(*diff_tree_args)
1140        if not result:
1141            return
1142
1143        def iter_entry(result):
1144            start = 0
1145            while True:
1146                idx = result.find(b'\0', start)
1147                if idx == -1:
1148                    return
1149                yield result[start:idx]
1150                start = idx + 1
1151
1152        entries = list(iter_entry(result))
1153        if not tree1:
1154            # if only one tree-sha is given on commandline,
1155            # the first line is just the redundant tree-sha itself...
1156            entry = entries.pop(0)
1157            assert not entry.startswith(b':')
1158
1159        yield from self._iter_diff_tree(entries)
1160
1161    def _iter_diff_tree(self, entries):
1162
1163        def next_entry():
1164            return next(iter_entry)
1165
1166        iter_entry = iter(entries)
1167        while True:
1168            try:
1169                entry = next_entry()
1170            except StopIteration:
1171                return
1172            assert entry.startswith(b':')
1173            values = entry[1:].split(b' ')
1174            assert len(values) == 5
1175            old_mode, new_mode, old_sha, new_sha, change = values
1176            old_mode = int(old_mode, 8)
1177            new_mode = int(new_mode, 8)
1178            old_sha = _rev_u(old_sha)
1179            new_sha = _rev_u(new_sha)
1180            change = str(change[:1], 'utf-8')
1181            old_path = self._fs_to_unicode(next_entry())
1182            new_path = None
1183            if change in ('R', 'C'):  # renamed or copied
1184                new_path = self._fs_to_unicode(next_entry())
1185            yield (old_mode, new_mode, old_sha, new_sha, change, old_path,
1186                   new_path)
1187
1188    def _raise_not_readable(self, git_dir, e):
1189        raise GitError("Make sure the Git repository '%s' is readable: %s"
1190                       % (git_dir, to_unicode(e)))
1191
1192    def _control_files_exist(self, git_dir):
1193        for name in ('HEAD', 'objects', 'refs'):
1194            if not os.path.exists(os.path.join(git_dir, name)):
1195                self.logger.debug("Missing Git control file '%s' in '%s'",
1196                                  name, git_dir)
1197                return False
1198        return True
1199