1"""Git interaction library.
2bup repositories are in Git format. This library allows us to
3interact with the Git data structures.
4"""
5
6from __future__ import absolute_import, print_function
7import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
8from array import array
9from binascii import hexlify, unhexlify
10from collections import namedtuple
11from itertools import islice
12from numbers import Integral
13
14from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat
15from bup.compat import (buffer,
16                        byte_int, bytes_from_byte, bytes_from_uint,
17                        environ,
18                        items,
19                        range,
20                        reraise)
21from bup.io import path_msg
22from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
23                         exo,
24                         fdatasync,
25                         hostname, localtime, log,
26                         merge_dict,
27                         merge_iter,
28                         mmap_read, mmap_readwrite,
29                         parse_num,
30                         progress, qprogress, stat_if_exists,
31                         unlink,
32                         utc_offset_str)
33from bup.pwdgrp import username, userfullname
34
35
36verbose = 0
37repodir = None  # The default repository, once initialized
38
39_typemap =  {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4}
40_typermap = {v: k for k, v in items(_typemap)}
41
42
43_total_searches = 0
44_total_steps = 0
45
46
47class GitError(Exception):
48    pass
49
50
51def _gitenv(repo_dir=None):
52    if not repo_dir:
53        repo_dir = repo()
54    return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)})
55
56def _git_wait(cmd, p):
57    rv = p.wait()
58    if rv != 0:
59        raise GitError('%r returned %d' % (cmd, rv))
60
61def _git_exo(cmd, **kwargs):
62    kwargs['check'] = False
63    result = exo(cmd, **kwargs)
64    _, _, proc = result
65    if proc.returncode != 0:
66        raise GitError('%r returned %d' % (cmd, proc.returncode))
67    return result
68
69def git_config_get(option, repo_dir=None):
70    cmd = (b'git', b'config', b'--get', option)
71    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
72                         env=_gitenv(repo_dir=repo_dir))
73    r = p.stdout.read()
74    rc = p.wait()
75    if rc == 0:
76        return r
77    if rc != 1:
78        raise GitError('%r returned %d' % (cmd, rc))
79    return None
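
# Illustrative usage sketch (assumes check_repo_or_die() has already set up
# the default repository; the value is git's raw stdout as bytes, or None
# when the option is unset):
#
#   limit = git_config_get(b'pack.packSizeLimit')
#   if limit is not None:
#       max_size = parse_num(limit)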


def parse_tz_offset(s):
    """UTC offset in seconds."""
    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
    if bytes_from_byte(s[0]) == b'-':
        return -tz_off
    return tz_off
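
# For example, the git timezone field b'-0500' means five hours west of UTC:
# parse_tz_offset(b'-0500') == -18000, and parse_tz_offset(b'+0230') == 9000.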


# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
# Make sure that's authoritative.
_start_end_char = br'[^ .,:;<>"\'\0\n]'
_content_char = br'[^\0\n<>]'
_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \
    % (_start_end_char,
       _start_end_char, _content_char, _start_end_char)
_tz_rx = br'[-+]\d\d[0-5]\d'
_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)'
# Assumes every following line starting with a space is part of the
# mergetag.  Is there a formal commit blob spec?
_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)'
_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?)

(?P<message>(?:.|\n)*)''' % (_parent_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _safe_str_rx, _safe_str_rx, _tz_rx,
                             _mergetag_rx))
_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*')

# Note that the author_sec and committer_sec values are (UTC) epoch
# seconds, and for now the mergetag is not included.
CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
                                       'author_name', 'author_mail',
                                       'author_sec', 'author_offset',
                                       'committer_name', 'committer_mail',
                                       'committer_sec', 'committer_offset',
                                       'message'])

def parse_commit(content):
    commit_match = re.match(_commit_rx, content)
    if not commit_match:
        raise Exception('cannot parse commit %r' % content)
    matches = commit_match.groupdict()
    return CommitInfo(tree=matches['tree'],
                      parents=re.findall(_parent_hash_rx, matches['parents']),
                      author_name=matches['author_name'],
                      author_mail=matches['author_mail'],
                      author_sec=int(matches['asec']),
                      author_offset=parse_tz_offset(matches['atz']),
                      committer_name=matches['committer_name'],
                      committer_mail=matches['committer_mail'],
                      committer_sec=int(matches['csec']),
                      committer_offset=parse_tz_offset(matches['ctz']),
                      message=matches['message'])
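
# Illustrative sketch: parsing a minimal, parentless commit object (all
# field values below are hypothetical):
#
#   info = parse_commit(b'tree ' + b'0' * 40 + b'\n'
#                       b'author A U Thor <a@example.com> 1234567890 +0000\n'
#                       b'committer A U Thor <a@example.com> 1234567890 +0000\n'
#                       b'\n'
#                       b'message\n')
#   assert info.tree == b'0' * 40
#   assert info.parents == []
#   assert info.author_sec == 1234567890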


def get_cat_data(cat_iterator, expected_type):
    _, kind, _ = next(cat_iterator)
    if kind != expected_type:
        raise Exception('expected %r, saw %r' % (expected_type, kind))
    return b''.join(cat_iterator)

def get_commit_items(id, cp):
    return parse_commit(get_cat_data(cp.get(id), b'commit'))

def _local_git_date_str(epoch_sec):
    return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec))


def _git_date_str(epoch_sec, tz_offset_sec):
    offs = tz_offset_sec // 60
    return b'%d %s%02d%02d' \
        % (epoch_sec,
           b'+' if offs >= 0 else b'-',
           abs(offs) // 60,
           abs(offs) % 60)
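
# For example, _git_date_str(1234567890, -18000) == b'1234567890 -0500';
# _local_git_date_str() yields the same form using the local timezone's
# offset at the given moment.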


def repo(sub=b'', repo_dir=None):
    """Get the path to the git repository or one of its subdirectories."""
    repo_dir = repo_dir or repodir
    if not repo_dir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repo_dir, b'.git')
    if os.path.exists(gd):
        repo_dir = gd

    return os.path.join(repo_dir, sub)


_shorten_hash_rx = \
    re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)')

def shorten_hash(s):
    return _shorten_hash_rx.sub(br'\1\2*\3', s)
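
# For example, any full 40-digit hash embedded in s is abbreviated to its
# first 7 digits plus b'*':
#
#   shorten_hash(b'pack-ffe87c9dee6b2e63c12f762fb90a9f67cbd1b6b0.idx')
#   # -> b'pack-ffe87c9*.idx'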


def repo_rel(path):
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(b''))
    if not fullrepo.endswith(b'/'):
        fullrepo += b'/'
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith(b'index-cache/'):
        path = path[len(b'index-cache/'):]
    return shorten_hash(path)


def all_packdirs():
    paths = [repo(b'objects/pack')]
    paths += glob.glob(repo(b'index-cache/*/.'))
    return paths


def auto_midx(objdir):
    args = [path.exe(), b'midx', b'--auto', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), b'bloom', b'--dir', objdir]
    try:
        rv = subprocess.call(args, stdout=open(os.devnull, 'w'))
    except OSError as e:
        # make sure 'args' gets printed to help with debugging
        add_error('%r: exception: %s' % (args, e))
        raise
    if rv:
        add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them.  If a
    file's name already ends with ".bup" (or ".bup" plus one more
    character, e.g. an existing ".bupl" or ".bupm"), a ".bupl" extension
    is added to disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        assert(stat.S_ISDIR(gitmode))
        return name + b'.bup'
    elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'):
        return name + b'.bupl'
    else:
        return name


(BUP_NORMAL, BUP_CHUNKED) = (0, 1)
def demangle_name(name, mode):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is
    one of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be reassembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith(b'.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith(b'.bup'):
        return (name[:-4], BUP_CHUNKED)
    elif name.endswith(b'.bupm'):
        return (name[:-5],
                BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL)
    else:
        return (name, BUP_NORMAL)
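
# Round-trip sketch (modes are ordinary git tree modes; values illustrative):
#
#   assert mangle_name(b'data', 0o100644, 0o040000) == b'data.bup'
#   assert demangle_name(b'data.bup', 0o040000) == (b'data', BUP_CHUNKED)
#   assert mangle_name(b'x.bup', 0o100644, 0o100644) == b'x.bup.bupl'
#   assert demangle_name(b'x.bup.bupl', 0o100644) == (b'x.bup', BUP_NORMAL)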


def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    header = b'%s %d\0' % (type, len(content))
    sum = Sha1(header)
    sum.update(content)
    return sum.digest()
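
# This matches git's object ids, e.g. hexlify(calc_hash(b'blob', b'')) is
# b'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391', the same id that
# "git hash-object" assigns to an empty file.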


def shalist_item_sort_key(ent):
    (mode, name, id) = ent
    assert(mode+0 == mode)
    if stat.S_ISDIR(mode):
        return name + b'/'
    else:
        return name


def tree_encode(shalist):
    """Generate a git tree object from (mode,name,hash) tuples."""
    shalist = sorted(shalist, key=shalist_item_sort_key)
    l = []
    for (mode,name,bin) in shalist:
        assert(mode)
        assert(mode+0 == mode)
        assert(name)
        assert(len(bin) == 20)
        s = b'%o %s\0%s' % (mode,name,bin)
        # Compare a one-byte slice, not s[0]: under Python 3 s[0] is an int,
        # so comparing it to b'0' could never fail.
        assert s[0:1] != b'0'  # 0-padded octal is not acceptable in a git tree
        l.append(s)
    return b''.join(l)


def tree_decode(buf):
    """Generate a list of (mode,name,hash) from the git tree object in buf."""
    ofs = 0
    while ofs < len(buf):
        z = buf.find(b'\0', ofs)
        assert(z > ofs)
        spl = buf[ofs:z].split(b' ', 1)
        assert(len(spl) == 2)
        mode,name = spl
        sha = buf[z+1:z+1+20]
        ofs = z+1+20
        yield (int(mode, 8), name, sha)
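
# Round-trip sketch (a hypothetical single-entry tree):
#
#   buf = tree_encode([(0o100644, b'greeting', b'\xaa' * 20)])
#   assert list(tree_decode(buf)) == [(0o100644, b'greeting', b'\xaa' * 20)]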


def _encode_packobj(type, content, compression_level=1):
    if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9):
        raise ValueError('invalid compression level %s' % compression_level)
    szout = b''
    sz = len(content)
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80
        szout += bytes_from_uint(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(compression_level)
    yield szout
    yield z.compress(content)
    yield z.flush()


def _encode_looseobj(type, content, compression_level=1):
    z = zlib.compressobj(compression_level)
    yield z.compress(b'%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()


def _decode_looseobj(buf):
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find(b'\0')
    assert(i > 0)
    l = s[:i].split(b' ')
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)


def _decode_packobj(buf):
    assert(buf)
    c = byte_int(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    while c & 0x80:
        i += 1
        c = byte_int(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
        if not (c & 0x80):
            break
    return (type, zlib.decompress(buf[i+1:]))
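
# Round-trip sketch: the header encodes the type and the uncompressed size
# (base-128, low bits first), and the body is zlib-compressed:
#
#   packed = b''.join(_encode_packobj(b'blob', b'hello'))
#   assert _decode_packobj(packed) == (b'blob', b'hello')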


class PackIdx:
    def __init__(self):
        assert(0)

    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        if idx is not None:
            return self._ofs_from_idx(idx)
        return None

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        if hash and (self._idx_from_hash(hash) is not None):
            return want_source and os.path.basename(self.name) or True
        return None

    def _idx_from_hash(self, hash):
        global _total_searches, _total_steps
        _total_searches += 1
        assert(len(hash) == 20)
        b1 = byte_int(hash[0])
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        want = hash
        _total_steps += 1  # lookup table is a step
        while start < end:
            _total_steps += 1
            mid = start + (end - start) // 2
            v = self._idx_to_hash(mid)
            if v < want:
                start = mid+1
            elif v > want:
                end = mid
            else: # got it!
                return mid
        return None
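
# Search-range sketch: for a hash whose first byte is b1, the candidates
# occupy the contiguous slice [fanout[b1-1], fanout[b1]) of the sorted sha
# table, since fanout[b] counts the entries whose first byte is <= b.  The
# extra 0 appended to fanout in the subclasses below ("entry -1") makes the
# b1 == 0 case start at 0.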


class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # Min size for 'L' is 4, which is sufficient for struct's '!I'.
        # Use unpack_from to read just the fanout table; unpack() requires
        # the buffer to be exactly 1024 bytes, but the map is the whole file.
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=0))
        self.fanout.append(0)  # entry "-1"
        self.nsha = self.fanout[255]
        self.sha_ofs = 256 * 4
        # Avoid slicing shatable for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24
        return struct.unpack_from('!I', self.map, offset=ofs)[0]

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 24 + 4
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs + 4
        for ofs in range(start, start + 24 * self.nsha, 24):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None


class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert self.map[0:8] == b'\377tOc\0\0\0\2'
        # Min size for 'L' is 4, which is sufficient for struct's '!I'
        self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8))
        self.fanout.append(0)
        self.nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4
        self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4
        # Avoid slicing this for individual hashes (very high overhead)
        self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __len__(self):
        return int(self.nsha)  # int() from long for python 2

    def _ofs_from_idx(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs_ofs = self.ofstable_ofs + idx * 4
        ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0]
        if ofs & 0x80000000:
            idx64 = ofs & 0x7fffffff
            ofs64_ofs = self.ofs64table_ofs + idx64 * 8
            ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0]
        return ofs

    def _idx_to_hash(self, idx):
        if idx >= self.nsha or idx < 0:
            raise IndexError('invalid pack index index %d' % idx)
        ofs = self.sha_ofs + idx * 20
        return self.map[ofs : ofs + 20]

    def __iter__(self):
        start = self.sha_ofs
        for ofs in range(start, start + 20 * self.nsha, 20):
            yield self.map[ofs : ofs + 20]

    def close(self):
        if self.map is not None:
            self.shatable = None
            self.map.close()
            self.map = None


_mpi_count = 0
class PackIdxList:
    def __init__(self, dir, ignore_midx=False):
        global _mpi_count
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        _mpi_count += 1
        self.dir = dir
        self.also = set()
        self.packs = []
        self.do_bloom = False
        self.bloom = None
        self.ignore_midx = ignore_midx
        self.refresh()

    def __del__(self):
        global _mpi_count
        _mpi_count -= 1
        assert(_mpi_count == 0)

    def __iter__(self):
        return iter(idxmerge(self.packs))

    def __len__(self):
        return sum(len(pack) for pack in self.packs)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        _total_searches += 1
        if hash in self.also:
            return True
        if self.do_bloom and self.bloom:
            if self.bloom.exists(hash):
                self.do_bloom = False
            else:
                _total_searches -= 1  # was counted by bloom
                return None
        for i in range(len(self.packs)):
            p = self.packs[i]
            _total_searches -= 1  # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            if ix:
                # reorder so most recently used packs are searched first
                self.packs = [p] + self.packs[:i] + self.packs[i+1:]
                return ix
        self.do_bloom = True
        return None

    def refresh(self, skip_midx=False):
        """Refresh the index list.
        This method checks whether any .midx files have been superseded
        (e.g. their contents are all in another, larger .midx file) and
        removes the superseded files.

        If skip_midx is True, all work on .midx files will be skipped and
        .midx files will be removed from the list.

        The instance variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        if self.bloom is not None:
            self.bloom.close()
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or self.ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, midx.PackMidx))
        if os.path.exists(self.dir):
            if not skip_midx:
                midxl = []
                midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
                # remove any *.midx files from our list that no longer exist
                for ix in list(d.values()):
                    if not isinstance(ix, midx.PackMidx):
                        continue
                    if ix.name in midxes:
                        continue
                    # remove the midx
                    del d[ix.name]
                    ix.close()
                    self.packs.remove(ix)
                for ix in self.packs:
                    if isinstance(ix, midx.PackMidx):
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                for full in midxes:
                    if not d.get(full):
                        mx = midx.PackMidx(full)
                        (mxd, mxf) = os.path.split(mx.name)
                        broken = False
                        for n in mx.idxnames:
                            if not os.path.exists(os.path.join(mxd, n)):
                                log(('warning: index %s missing\n'
                                     '  used by %s\n')
                                    % (path_msg(n), path_msg(mxf)))
                                broken = True
                        if broken:
                            mx.close()
                            del mx
                            unlink(full)
                        else:
                            midxl.append(mx)
                midxl.sort(key=lambda ix:
                           (-len(ix), -xstat.stat(ix.name).st_mtime))
                for ix in midxl:
                    any_needed = False
                    for sub in ix.idxnames:
                        found = d.get(os.path.join(self.dir, sub))
                        if not found or isinstance(found, PackIdx):
                            # doesn't exist, or exists but not in a midx
                            any_needed = True
                            break
                    if any_needed:
                        d[ix.name] = ix
                        for name in ix.idxnames:
                            d[os.path.join(self.dir, name)] = ix
                    elif not ix.force_keep:
                        debug1('midx: removing redundant: %s\n'
                               % path_msg(os.path.basename(ix.name)))
                        ix.close()
                        unlink(ix.name)
            for full in glob.glob(os.path.join(self.dir, b'*.idx')):
                if not d.get(full):
                    try:
                        ix = open_idx(full)
                    except GitError as e:
                        add_error(e)
                        continue
                    d[full] = ix
            bfull = os.path.join(self.dir, b'bup.bloom')
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = bloom.ShaBloom(bfull)
            self.packs = list(set(d.values()))
            self.packs.sort(reverse=True, key=lambda x: len(x))
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
                self.do_bloom = True
            else:
                self.bloom = None
        debug1('PackIdxList: using %d index%s.\n'
            % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

    def add(self, hash):
        """Insert an additional object in the list."""
        self.also.add(hash)


def open_idx(filename):
    if filename.endswith(b'.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == b'\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (path_msg(filename), version))
        elif len(header) == 8 and header[0:4] < b'\377tOc':
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header'
                           % path_msg(filename))
    elif filename.endswith(b'.midx'):
        return midx.PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
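
# Illustrative sketch (the pack name is hypothetical; some_sha would be a
# 20-byte binary object id):
#
#   ix = open_idx(b'objects/pack/pack-deadbeef.idx')
#   print(len(ix))              # number of objects indexed
#   print(ix.exists(some_sha))  # truthy if present
#   ix.close()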


def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                     % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)


def _make_objcache():
    return PackIdxList(repo(b'objects/pack'))

# bup-gc assumes that it can disable all PackWriter activities
# (bloom/midx/cache) via the constructor and close() arguments.

class PackWriter:
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache, compression_level=1,
                 run_midx=True, on_pack_finish=None,
                 max_pack_size=None, max_pack_objects=None, repo_dir=None):
        self.repo_dir = repo_dir or repo()
        self.file = None
        self.parentfd = None
        self.count = 0
        self.outbytes = 0
        self.filename = None
        self.idx = None
        self.objcache_maker = objcache_maker
        self.objcache = None
        self.compression_level = compression_level
        self.run_midx = run_midx
        self.on_pack_finish = on_pack_finish
        if not max_pack_size:
            max_pack_size = git_config_get(b'pack.packSizeLimit',
                                           repo_dir=self.repo_dir)
            if max_pack_size is not None:
                max_pack_size = parse_num(max_pack_size)
            if not max_pack_size:
                # larger packs slow down pruning
                max_pack_size = 1000 * 1000 * 1000
        self.max_pack_size = max_pack_size
        # cache memory usage is about 83 bytes per object
        self.max_pack_objects = max_pack_objects if max_pack_objects \
                                else max(1, self.max_pack_size // 5000)

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _open(self):
        if not self.file:
            objdir = os.path.join(self.repo_dir, b'objects')
            fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                os.close(fd)
                raise
            try:
                self.parentfd = os.open(objdir, os.O_RDONLY)
            except:
                f = self.file
                self.file = None
                f.close()
                raise
            assert name.endswith(b'.pack')
            self.filename = name[:-5]
            self.file.write(b'PACK\0\0\0\2\0\0\0\0')
            self.idx = PackIdxV2Writer()

    def _raw_write(self, datalist, sha):
        self._open()
        f = self.file
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = b''.join(datalist)
        try:
            f.write(oneblob)
        except IOError as e:
            reraise(GitError(e))
        nw = len(oneblob)
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
        self.outbytes += nw
        self.count += 1
        return nw, crc

    def _update_idx(self, sha, crc, size):
        assert(sha)
        if self.idx:
            self.idx.add(sha, crc, self.file.tell() - size)

    def _write(self, sha, type, content):
        if verbose:
            log('>')
        if not sha:
            sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content,
                                                    self.compression_level),
                                    sha=sha)
        if self.outbytes >= self.max_pack_size \
           or self.count >= self.max_pack_objects:
            self.breakpoint()
        return sha

    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        id = self._end(self.run_midx)
        self.outbytes = self.count = 0
        return id

    def _require_objcache(self):
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            raise GitError(
                    "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)

    def just_write(self, sha, type, content):
        """Write an object to the pack file without checking for duplication."""
        self._write(sha, type, content)
        # If nothing else, gc doesn't have/want an objcache
        if self.objcache is not None:
            self.objcache.add(sha)

    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._require_objcache()
            self.just_write(sha, type, content)
        return sha

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write(b'blob', blob)

    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        content = tree_encode(shalist)
        return self.maybe_write(b'tree', content)

    def new_commit(self, tree, parent,
                   author, adate_sec, adate_tz,
                   committer, cdate_sec, cdate_tz,
                   msg):
        """Create a commit object in the pack.  The date_sec values must be
        epoch-seconds, and if a tz is None, the local timezone is assumed."""
        if adate_tz is not None:
            adate_str = _git_date_str(adate_sec, adate_tz)
        else:
            adate_str = _local_git_date_str(adate_sec)
        if cdate_tz is not None:
            cdate_str = _git_date_str(cdate_sec, cdate_tz)
        else:
            cdate_str = _local_git_date_str(cdate_sec)
        l = []
        if tree: l.append(b'tree %s' % hexlify(tree))
        if parent: l.append(b'parent %s' % hexlify(parent))
        if author: l.append(b'author %s %s' % (author, adate_str))
        if committer: l.append(b'committer %s %s' % (committer, cdate_str))
        l.append(b'')
        l.append(msg)
        return self.maybe_write(b'commit', b'\n'.join(l))

    def abort(self):
        """Remove the pack file from disk."""
        f = self.file
        if f:
            pfd = self.parentfd
            self.file = None
            self.parentfd = None
            self.idx = None
            try:
                try:
                    os.unlink(self.filename + b'.pack')
                finally:
                    f.close()
            finally:
                if pfd is not None:
                    os.close(pfd)

    def _end(self, run_midx=True):
        f = self.file
        if not f: return None
        self.file = None
        try:
            self.objcache = None
            idx = self.idx
            self.idx = None

            # update object count
            f.seek(8)
            cp = struct.pack('!i', self.count)
            assert(len(cp) == 4)
            f.write(cp)

            # calculate the pack sha1sum
            f.seek(0)
            sum = Sha1()
            for b in chunkyreader(f):
                sum.update(b)
            packbin = sum.digest()
            f.write(packbin)
            fdatasync(f.fileno())
        finally:
            f.close()

        obj_list_sha = idx.write(self.filename + b'.idx', packbin)
        nameprefix = os.path.join(self.repo_dir,
                                  b'objects/pack/pack-' + obj_list_sha)
        if os.path.exists(self.filename + b'.map'):
            os.unlink(self.filename + b'.map')
        os.rename(self.filename + b'.pack', nameprefix + b'.pack')
        os.rename(self.filename + b'.idx', nameprefix + b'.idx')
        try:
            os.fsync(self.parentfd)
        finally:
            os.close(self.parentfd)

        if run_midx:
            auto_midx(os.path.join(self.repo_dir, b'objects/pack'))

        if self.on_pack_finish:
            self.on_pack_finish(nameprefix)

        return nameprefix

    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
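
# Typical PackWriter lifecycle sketch (assumes check_repo_or_die() has
# already been called on an existing repository):
#
#   with PackWriter() as w:
#       blob_id = w.new_blob(b'some content')
#       tree_id = w.new_tree([(0o100644, b'file', blob_id)])
#   # Leaving the block calls close(), which finalizes the pack and its
#   # .idx under objects/pack and (by default) refreshes the midx.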


class PackIdxV2Writer:
    def __init__(self):
        self.idx = list(list() for i in range(256))
        self.count = 0

    def add(self, sha, crc, offs):
        assert(sha)
        self.count += 1
        self.idx[byte_int(sha[0])].append((sha, crc, offs))

    def write(self, filename, packbin):
        ofs64_count = 0
        for section in self.idx:
            for entry in section:
                if entry[2] >= 2**31:
                    ofs64_count += 1

        # Length: header + fan-out + shas-and-crcs + overflow-offsets
        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
        idx_map = None
        idx_f = open(filename, 'w+b')
        try:
            idx_f.truncate(index_len)
            fdatasync(idx_f.fileno())
            idx_map = mmap_readwrite(idx_f, close=False)
            try:
                count = _helpers.write_idx(filename, idx_map, self.idx,
                                           self.count)
                assert(count == self.count)
                idx_map.flush()
            finally:
                idx_map.close()
        finally:
            idx_f.close()

        idx_f = open(filename, 'a+b')
        try:
            idx_f.write(packbin)
            idx_f.seek(0)
            idx_sum = Sha1()
            b = idx_f.read(8 + 4*256)
            idx_sum.update(b)

            obj_list_sum = Sha1()
            for b in chunkyreader(idx_f, 20 * self.count):
                idx_sum.update(b)
                obj_list_sum.update(b)
            namebase = hexlify(obj_list_sum.digest())

            for b in chunkyreader(idx_f):
                idx_sum.update(b)
            idx_f.write(idx_sum.digest())
            fdatasync(idx_f.fileno())
            return namebase
        finally:
            idx_f.close()


def list_refs(patterns=None, repo_dir=None,
              limit_to_heads=False, limit_to_tags=False):
    """Yield (refname, hash) tuples for all repository refs unless
    patterns are specified.  In that case, only include tuples for
    refs matching those patterns (cf. git-show-ref(1)).  The limits
    restrict the result items to refs/heads or refs/tags.  If both
    limits are specified, items from both sources will be included.

    """
    argv = [b'git', b'show-ref']
    if limit_to_heads:
        argv.append(b'--heads')
    if limit_to_tags:
        argv.append(b'--tags')
    argv.append(b'--')
    if patterns:
        argv.extend(patterns)
    p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    if rv:
        assert(not out)
    if out:
        for d in out.split(b'\n'):
            sha, name = d.split(b' ', 1)
            yield name, unhexlify(sha)


def read_ref(refname, repo_dir=None):
    """Get the commit id of the most recent commit made on a given ref."""
    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
    l = tuple(islice(refs, 2))
    if l:
        assert(len(l) == 1)
        return l[0][1]
    else:
        return None
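
# For example, read_ref(b'refs/heads/main') (a hypothetical branch) returns
# the 20-byte binary commit id for that branch, or None if it doesn't exist.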


def rev_list_invocation(ref_or_refs, format=None):
    if isinstance(ref_or_refs, bytes):
        refs = (ref_or_refs,)
    else:
        refs = ref_or_refs
    argv = [b'git', b'rev-list']

    if format:
        argv.append(b'--pretty=format:' + format)
    for ref in refs:
        assert not ref.startswith(b'-')
        argv.append(ref)
    argv.append(b'--')
    return argv


def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None):
    """Yield information about commits as per "git rev-list".  If a format
    is not provided, yield one hex hash at a time.  If a format is
    provided, pass it to rev-list and call parse(git_stdout) for each
    commit with the stream positioned just after the rev-list "commit
    HASH" header line.  When a format is provided yield (oidx,
    parse(git_stdout)) for each commit.

    """
    assert bool(parse) == bool(format)
    p = subprocess.Popen(rev_list_invocation(ref_or_refs,
                                             format=format),
                         env=_gitenv(repo_dir),
                         stdout=subprocess.PIPE)
    if not format:
        for line in p.stdout:
            yield line.strip()
    else:
        line = p.stdout.readline()
        while line:
            s = line.strip()
            if not s.startswith(b'commit '):
                raise Exception('unexpected line ' + repr(s))
            s = s[7:]
            assert len(s) == 40
            yield s, parse(p.stdout)
            line = p.stdout.readline()

    rv = p.wait()  # not fatal
    if rv:
        raise GitError('git rev-list returned error %d' % rv)
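
# Illustrative sketch: to walk commits along with their author dates, pass
# a format and a parse function that consumes exactly one record from
# git's stdout (the ref name is hypothetical):
#
#   def parse_author_sec(f):
#       return int(f.readline())  # b'<epoch-seconds>\n', per %at below
#
#   for oidx, sec in rev_list(b'refs/heads/main', parse=parse_author_sec,
#                             format=b'%at'):
#       ...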


def get_commit_dates(refs, repo_dir=None):
    """Get the dates for the specified commit refs.  For now, every unique
       string in refs must resolve to a different commit or this
       function will fail."""
    result = []
    for ref in refs:
        commit = get_commit_items(ref, cp(repo_dir))
        result.append(commit.author_sec)
    return result


def rev_parse(committish, repo_dir=None):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the 20-byte binary hash if it is found, or None if
    'committish' does not correspond to anything.
    """
    head = read_ref(committish, repo_dir=repo_dir)
    if head:
        debug2("resolved from ref: commit = %s\n" % hexlify(head))
        return head

    pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir))

    if len(committish) == 40:
        try:
            hash = unhexlify(committish)
        except (TypeError, ValueError):
            # py2 raises TypeError; py3 raises binascii.Error (a ValueError)
            return None

        if pL.exists(hash):
            return hash

    return None


def update_ref(refname, newval, oldval, repo_dir=None):
    """Update a repository reference."""
    if not oldval:
        oldval = b''
    assert refname.startswith(b'refs/heads/') \
        or refname.startswith(b'refs/tags/')
    p = subprocess.Popen([b'git', b'update-ref', refname,
                          hexlify(newval), hexlify(oldval)],
                         env=_gitenv(repo_dir))
    _git_wait(b'git update-ref', p)


def delete_ref(refname, oldvalue=None):
    """Delete a repository reference (see git update-ref(1))."""
    assert refname.startswith(b'refs/')
    oldvalue = [] if not oldvalue else [oldvalue]
    p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue,
                         env=_gitenv())
    _git_wait(b'git update-ref', p)


def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        repodir = environ.get(b'BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser(b'~/.bup')


def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n'
                       % path_msg(parent))
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')):
        raise GitError('"%s" exists but is not a directory\n' % path_msg(d))
    p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr,
                         env=_gitenv())
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', b'2'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)
    # Enable the reflog
    p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'],
                         stdout=sys.stderr, env=_gitenv())
    _git_wait('git config', p)


def check_repo_or_die(path=None):
    """Check to see if a bup repository probably exists, and abort if not."""
    guess_repo(path)
    top = repo()
    pst = stat_if_exists(top + b'/objects/pack')
    if pst and stat.S_ISDIR(pst.st_mode):
        return
    if not pst:
        top_st = stat_if_exists(top)
        if not top_st:
            log('error: repository %r does not exist (see "bup help init")\n'
                % top)
            sys.exit(15)
    log('error: %s is not a repository\n' % path_msg(top))
    sys.exit(14)


def is_suitable_git(ver_str):
    if not ver_str.startswith(b'git version '):
        return 'unrecognized'
    ver_str = ver_str[len(b'git version '):]
    if ver_str.startswith(b'0.'):
        return 'insufficient'
    if ver_str.startswith(b'1.'):
        if re.match(br'1\.[012345]rc', ver_str):
            return 'insufficient'
        if re.match(br'1\.[01234]\.', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.[012345]($|\.)', ver_str):
            return 'insufficient'
        if re.match(br'1\.5\.6-rc', ver_str):
            return 'insufficient'
        return 'suitable'
    if re.match(br'[0-9]+(\.|$)?', ver_str):
        return 'suitable'
    sys.exit(13)

_git_great = None

def require_suitable_git(ver_str=None):
    """Raise GitError if the version of git isn't suitable.

    Rely on ver_str when provided, rather than invoking the git in the
    path.

    """
    global _git_great
    if _git_great is not None:
        return
    if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \
       in (b'yes', b'true', b'1'):
        _git_great = True
        return
    if not ver_str:
        ver_str, _, _ = _git_exo([b'git', b'--version'])
    status = is_suitable_git(ver_str)
    if status == 'unrecognized':
        raise GitError('Unexpected git --version output: %r' % ver_str)
    if status == 'insufficient':
        log('error: git version must be at least 1.5.6\n')
        sys.exit(1)
    if status == 'suitable':
        _git_great = True
        return
    assert False


class _AbortableIter:
    def __init__(self, it, onabort=None):
        self.it = it
        self.onabort = onabort
        self.done = None

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.it)
        except StopIteration:
            self.done = True
            raise
        except:
            self.abort()
            raise

    next = __next__

    def abort(self):
        """Abort iteration and call the abort callback, if any."""
        if not self.done:
            self.done = True
            if self.onabort:
                self.onabort()

    def __del__(self):
        self.abort()


class CatPipe:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    def __init__(self, repo_dir=None):
        require_suitable_git()
        self.repo_dir = repo_dir
        self.p = self.inprogress = None

    def _abort(self):
        if self.p:
            self.p.stdout.close()
            self.p.stdin.close()
        self.p = None
        self.inprogress = None

    def restart(self):
        self._abort()
        self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  close_fds=True,
                                  bufsize=4096,
                                  env=_gitenv(self.repo_dir))

    def get(self, ref):
        """Yield (oidx, type, size), followed by the data referred to by ref.
        If ref does not exist, only yield (None, None, None).

        """
        if not self.p or self.p.poll() is not None:
            self.restart()
        assert(self.p)
        poll_result = self.p.poll()
        assert(poll_result is None)
        if self.inprogress:
            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
        assert(not self.inprogress)
        assert ref.find(b'\n') < 0
        assert ref.find(b'\r') < 0
        assert not ref.startswith(b'-')
        self.inprogress = ref
        self.p.stdin.write(ref + b'\n')
        self.p.stdin.flush()
        hdr = self.p.stdout.readline()
        if hdr.endswith(b' missing\n'):
            self.inprogress = None
            yield None, None, None
            return
        info = hdr.split(b' ')
        if len(info) != 3 or len(info[0]) != 40:
            raise GitError('expected object (id, type, size), got %r' % info)
        oidx, typ, size = info
        size = int(size)
        it = _AbortableIter(chunkyreader(self.p.stdout, size),
                            onabort=self._abort)
        try:
            yield oidx, typ, size
            for blob in it:
                yield blob
            readline_result = self.p.stdout.readline()
            assert readline_result == b'\n'
            self.inprogress = None
        except Exception:
            it.abort()
            raise

    def _join(self, it):
        _, typ, _ = next(it)
        if typ == b'blob':
            for blob in it:
                yield blob
        elif typ == b'tree':
            treefile = b''.join(it)
            for (mode, name, sha) in tree_decode(treefile):
                for blob in self.join(hexlify(sha)):
                    yield blob
        elif typ == b'commit':
            treeline = b''.join(it).split(b'\n')[0]
            assert treeline.startswith(b'tree ')
            for blob in self.join(treeline[5:]):
                yield blob
        else:
            raise GitError('invalid object type %r: expected blob/tree/commit'
                           % typ)

    def join(self, id):
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
            yield d


_cp = {}

def cp(repo_dir=None):
    """Create a CatPipe object or reuse the already existing one."""
    global _cp, repodir
    if not repo_dir:
        repo_dir = repodir or repo()
    repo_dir = os.path.abspath(repo_dir)
    cp = _cp.get(repo_dir)
    if not cp:
        cp = CatPipe(repo_dir)
        _cp[repo_dir] = cp
    return cp
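
# Illustrative sketch: retrieving an object's type and data via the shared
# CatPipe (the ref is hypothetical; see CatPipe.get for the yielded shape):
#
#   it = cp().get(b'refs/heads/main')
#   oidx, typ, size = next(it)
#   data = b''.join(it)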


def tags(repo_dir=None):
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for n, c in list_refs(repo_dir=repo_dir, limit_to_tags=True):
        assert n.startswith(b'refs/tags/')
        name = n[10:]
        if c not in tags:
            tags[c] = []
        tags[c].append(name)  # more than one tag can point at 'c'
    return tags


class MissingObject(KeyError):
    def __init__(self, oid):
        self.oid = oid
        KeyError.__init__(self, 'object %r is missing' % hexlify(oid))


WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode',
                                   'path', 'chunk_path', 'data'])
# The path is the mangled path, and if an item represents a fragment
# of a chunked file, the chunk_path will be the chunked subtree path
# for the chunk, i.e. ['', '2d3115e', ...].  The top-level path for a
# chunked file will have a chunk_path of [''].  So some chunk subtree
# of the file '/foo/bar/baz' might look like this:
#
#   item.path = ['foo', 'bar', 'baz.bup']
#   item.chunk_path = ['', '2d3115e', '016b097']
#   item.type = 'tree'
#   ...


def walk_object(get_ref, oidx, stop_at=None, include_data=None):
    """Yield everything reachable from oidx via get_ref (which must behave
    like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx)
    returns true.  Throw MissingObject if a hash encountered is
    missing from the repository, and don't read or return blob content
    in the data field unless include_data is set.

    """
    # Maintain the pending stack on the heap to avoid stack overflow
    pending = [(oidx, [], [], None)]
    while pending:
        oidx, parent_path, chunk_path, mode = pending.pop()
        oid = unhexlify(oidx)
        if stop_at and stop_at(oidx):
            continue

        if (not include_data) and mode and stat.S_ISREG(mode):
            # If the object is a "regular file", then it's a leaf in
            # the graph, so we can skip reading the data if the caller
            # hasn't requested it.
            yield WalkItem(oid=oid, type=b'blob',
                           chunk_path=chunk_path, path=parent_path,
                           mode=mode,
                           data=None)
            continue

        item_it = get_ref(oidx)
        get_oidx, typ, _ = next(item_it)
        if not get_oidx:
            raise MissingObject(unhexlify(oidx))
        if typ not in (b'blob', b'commit', b'tree'):
            raise Exception('unexpected repository object type %r' % typ)

        # FIXME: set the mode based on the type when the mode is None
        if typ == b'blob' and not include_data:
            # Dump data until we can ask cat_pipe not to fetch it
            for ignored in item_it:
                pass
            data = None
        else:
            data = b''.join(item_it)

        yield WalkItem(oid=oid, type=typ,
                       chunk_path=chunk_path, path=parent_path,
                       mode=mode,
                       data=(data if include_data else None))

        if typ == b'commit':
            commit_items = parse_commit(data)
            for pid in commit_items.parents:
                pending.append((pid, parent_path, chunk_path, mode))
            pending.append((commit_items.tree, parent_path, chunk_path,
                            hashsplit.GIT_MODE_TREE))
        elif typ == b'tree':
            for mode, name, ent_id in tree_decode(data):
                demangled, bup_type = demangle_name(name, mode)
                if chunk_path:
                    sub_path = parent_path
                    sub_chunk_path = chunk_path + [name]
                else:
                    sub_path = parent_path + [name]
                    if bup_type == BUP_CHUNKED:
                        sub_chunk_path = [b'']
                    else:
                        sub_chunk_path = chunk_path
                pending.append((hexlify(ent_id), sub_path, sub_chunk_path,
                                mode))
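
# Illustrative traversal sketch (assumes an existing repository and a valid
# hex commit id in some_commit_oidx; prints every reachable item's path):
#
#   catpipe = cp()
#   for item in walk_object(catpipe.get, some_commit_oidx):
#       print(item.type, b'/'.join(item.path))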