1"""Git interaction library. 2bup repositories are in Git format. This library allows us to 3interact with the Git data structures. 4""" 5 6from __future__ import absolute_import, print_function 7import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob 8from array import array 9from binascii import hexlify, unhexlify 10from collections import namedtuple 11from itertools import islice 12from numbers import Integral 13 14from bup import _helpers, compat, hashsplit, path, midx, bloom, xstat 15from bup.compat import (buffer, 16 byte_int, bytes_from_byte, bytes_from_uint, 17 environ, 18 items, 19 range, 20 reraise) 21from bup.io import path_msg 22from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2, 23 exo, 24 fdatasync, 25 hostname, localtime, log, 26 merge_dict, 27 merge_iter, 28 mmap_read, mmap_readwrite, 29 parse_num, 30 progress, qprogress, stat_if_exists, 31 unlink, 32 utc_offset_str) 33from bup.pwdgrp import username, userfullname 34 35 36verbose = 0 37repodir = None # The default repository, once initialized 38 39_typemap = {b'blob': 3, b'tree': 2, b'commit': 1, b'tag': 4} 40_typermap = {v: k for k, v in items(_typemap)} 41 42 43_total_searches = 0 44_total_steps = 0 45 46 47class GitError(Exception): 48 pass 49 50 51def _gitenv(repo_dir=None): 52 if not repo_dir: 53 repo_dir = repo() 54 return merge_dict(environ, {b'GIT_DIR': os.path.abspath(repo_dir)}) 55 56def _git_wait(cmd, p): 57 rv = p.wait() 58 if rv != 0: 59 raise GitError('%r returned %d' % (cmd, rv)) 60 61def _git_exo(cmd, **kwargs): 62 kwargs['check'] = False 63 result = exo(cmd, **kwargs) 64 _, _, proc = result 65 if proc.returncode != 0: 66 raise GitError('%r returned %d' % (cmd, proc.returncode)) 67 return result 68 69def git_config_get(option, repo_dir=None): 70 cmd = (b'git', b'config', b'--get', option) 71 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, 72 env=_gitenv(repo_dir=repo_dir)) 73 r = p.stdout.read() 74 rc = p.wait() 75 if rc == 0: 76 return r 77 if rc != 1: 78 raise GitError('%r returned %d' % (cmd, rc)) 79 return None 80 81 82def parse_tz_offset(s): 83 """UTC offset in seconds.""" 84 tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60) 85 if bytes_from_byte(s[0]) == b'-': 86 return - tz_off 87 return tz_off 88 89 90# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt 91# Make sure that's authoritative. 92_start_end_char = br'[^ .,:;<>"\'\0\n]' 93_content_char = br'[^\0\n<>]' 94_safe_str_rx = br'(?:%s{1,2}|(?:%s%s*%s))' \ 95 % (_start_end_char, 96 _start_end_char, _content_char, _start_end_char) 97_tz_rx = br'[-+]\d\d[0-5]\d' 98_parent_rx = br'(?:parent [abcdefABCDEF0123456789]{40}\n)' 99# Assumes every following line starting with a space is part of the 100# mergetag. Is there a formal commit blob spec? 101_mergetag_rx = br'(?:\nmergetag object [abcdefABCDEF0123456789]{40}(?:\n [^\0\n]*)*)' 102_commit_rx = re.compile(br'''tree (?P<tree>[abcdefABCDEF0123456789]{40}) 103(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s) 104committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)(?P<mergetag>%s?) 105 106(?P<message>(?:.|\n)*)''' % (_parent_rx, 107 _safe_str_rx, _safe_str_rx, _tz_rx, 108 _safe_str_rx, _safe_str_rx, _tz_rx, 109 _mergetag_rx)) 110_parent_hash_rx = re.compile(br'\s*parent ([abcdefABCDEF0123456789]{40})\s*') 111 112# Note that the author_sec and committer_sec values are (UTC) epoch 113# seconds, and for now the mergetag is not included. 114CommitInfo = namedtuple('CommitInfo', ['tree', 'parents', 115 'author_name', 'author_mail', 116 'author_sec', 'author_offset', 117 'committer_name', 'committer_mail', 118 'committer_sec', 'committer_offset', 119 'message']) 120 121def parse_commit(content): 122 commit_match = re.match(_commit_rx, content) 123 if not commit_match: 124 raise Exception('cannot parse commit %r' % content) 125 matches = commit_match.groupdict() 126 return CommitInfo(tree=matches['tree'], 127 parents=re.findall(_parent_hash_rx, matches['parents']), 128 author_name=matches['author_name'], 129 author_mail=matches['author_mail'], 130 author_sec=int(matches['asec']), 131 author_offset=parse_tz_offset(matches['atz']), 132 committer_name=matches['committer_name'], 133 committer_mail=matches['committer_mail'], 134 committer_sec=int(matches['csec']), 135 committer_offset=parse_tz_offset(matches['ctz']), 136 message=matches['message']) 137 138 139def get_cat_data(cat_iterator, expected_type): 140 _, kind, _ = next(cat_iterator) 141 if kind != expected_type: 142 raise Exception('expected %r, saw %r' % (expected_type, kind)) 143 return b''.join(cat_iterator) 144 145def get_commit_items(id, cp): 146 return parse_commit(get_cat_data(cp.get(id), b'commit')) 147 148def _local_git_date_str(epoch_sec): 149 return b'%d %s' % (epoch_sec, utc_offset_str(epoch_sec)) 150 151 152def _git_date_str(epoch_sec, tz_offset_sec): 153 offs = tz_offset_sec // 60 154 return b'%d %s%02d%02d' \ 155 % (epoch_sec, 156 b'+' if offs >= 0 else b'-', 157 abs(offs) // 60, 158 abs(offs) % 60) 159 160 161def repo(sub = b'', repo_dir=None): 162 """Get the path to the git repository or one of its subdirectories.""" 163 repo_dir = repo_dir or repodir 164 if not repo_dir: 165 raise GitError('You should call check_repo_or_die()') 166 167 # If there's a .git subdirectory, then the actual repo is in there. 168 gd = os.path.join(repo_dir, b'.git') 169 if os.path.exists(gd): 170 repo_dir = gd 171 172 return os.path.join(repo_dir, sub) 173 174 175_shorten_hash_rx = \ 176 re.compile(br'([^0-9a-z]|\b)([0-9a-z]{7})[0-9a-z]{33}([^0-9a-z]|\b)') 177 178def shorten_hash(s): 179 return _shorten_hash_rx.sub(br'\1\2*\3', s) 180 181 182def repo_rel(path): 183 full = os.path.abspath(path) 184 fullrepo = os.path.abspath(repo(b'')) 185 if not fullrepo.endswith(b'/'): 186 fullrepo += b'/' 187 if full.startswith(fullrepo): 188 path = full[len(fullrepo):] 189 if path.startswith(b'index-cache/'): 190 path = path[len(b'index-cache/'):] 191 return shorten_hash(path) 192 193 194def all_packdirs(): 195 paths = [repo(b'objects/pack')] 196 paths += glob.glob(repo(b'index-cache/*/.')) 197 return paths 198 199 200def auto_midx(objdir): 201 args = [path.exe(), b'midx', b'--auto', b'--dir', objdir] 202 try: 203 rv = subprocess.call(args, stdout=open(os.devnull, 'w')) 204 except OSError as e: 205 # make sure 'args' gets printed to help with debugging 206 add_error('%r: exception: %s' % (args, e)) 207 raise 208 if rv: 209 add_error('%r: returned %d' % (args, rv)) 210 211 args = [path.exe(), b'bloom', b'--dir', objdir] 212 try: 213 rv = subprocess.call(args, stdout=open(os.devnull, 'w')) 214 except OSError as e: 215 # make sure 'args' gets printed to help with debugging 216 add_error('%r: exception: %s' % (args, e)) 217 raise 218 if rv: 219 add_error('%r: returned %d' % (args, rv)) 220 221 222def mangle_name(name, mode, gitmode): 223 """Mangle a file name to present an abstract name for segmented files. 224 Mangled file names will have the ".bup" extension added to them. If a 225 file's name already ends with ".bup", a ".bupl" extension is added to 226 disambiguate normal files from segmented ones. 227 """ 228 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode): 229 assert(stat.S_ISDIR(gitmode)) 230 return name + b'.bup' 231 elif name.endswith(b'.bup') or name[:-1].endswith(b'.bup'): 232 return name + b'.bupl' 233 else: 234 return name 235 236 237(BUP_NORMAL, BUP_CHUNKED) = (0,1) 238def demangle_name(name, mode): 239 """Remove name mangling from a file name, if necessary. 240 241 The return value is a tuple (demangled_filename,mode), where mode is one of 242 the following: 243 244 * BUP_NORMAL : files that should be read as-is from the repository 245 * BUP_CHUNKED : files that were chunked and need to be reassembled 246 247 For more information on the name mangling algorithm, see mangle_name() 248 """ 249 if name.endswith(b'.bupl'): 250 return (name[:-5], BUP_NORMAL) 251 elif name.endswith(b'.bup'): 252 return (name[:-4], BUP_CHUNKED) 253 elif name.endswith(b'.bupm'): 254 return (name[:-5], 255 BUP_CHUNKED if stat.S_ISDIR(mode) else BUP_NORMAL) 256 else: 257 return (name, BUP_NORMAL) 258 259 260def calc_hash(type, content): 261 """Calculate some content's hash in the Git fashion.""" 262 header = b'%s %d\0' % (type, len(content)) 263 sum = Sha1(header) 264 sum.update(content) 265 return sum.digest() 266 267 268def shalist_item_sort_key(ent): 269 (mode, name, id) = ent 270 assert(mode+0 == mode) 271 if stat.S_ISDIR(mode): 272 return name + b'/' 273 else: 274 return name 275 276 277def tree_encode(shalist): 278 """Generate a git tree object from (mode,name,hash) tuples.""" 279 shalist = sorted(shalist, key = shalist_item_sort_key) 280 l = [] 281 for (mode,name,bin) in shalist: 282 assert(mode) 283 assert(mode+0 == mode) 284 assert(name) 285 assert(len(bin) == 20) 286 s = b'%o %s\0%s' % (mode,name,bin) 287 assert s[0] != b'0' # 0-padded octal is not acceptable in a git tree 288 l.append(s) 289 return b''.join(l) 290 291 292def tree_decode(buf): 293 """Generate a list of (mode,name,hash) from the git tree object in buf.""" 294 ofs = 0 295 while ofs < len(buf): 296 z = buf.find(b'\0', ofs) 297 assert(z > ofs) 298 spl = buf[ofs:z].split(b' ', 1) 299 assert(len(spl) == 2) 300 mode,name = spl 301 sha = buf[z+1:z+1+20] 302 ofs = z+1+20 303 yield (int(mode, 8), name, sha) 304 305 306def _encode_packobj(type, content, compression_level=1): 307 if compression_level not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9): 308 raise ValueError('invalid compression level %s' % compression_level) 309 szout = b'' 310 sz = len(content) 311 szbits = (sz & 0x0f) | (_typemap[type]<<4) 312 sz >>= 4 313 while 1: 314 if sz: szbits |= 0x80 315 szout += bytes_from_uint(szbits) 316 if not sz: 317 break 318 szbits = sz & 0x7f 319 sz >>= 7 320 z = zlib.compressobj(compression_level) 321 yield szout 322 yield z.compress(content) 323 yield z.flush() 324 325 326def _encode_looseobj(type, content, compression_level=1): 327 z = zlib.compressobj(compression_level) 328 yield z.compress(b'%s %d\0' % (type, len(content))) 329 yield z.compress(content) 330 yield z.flush() 331 332 333def _decode_looseobj(buf): 334 assert(buf); 335 s = zlib.decompress(buf) 336 i = s.find(b'\0') 337 assert(i > 0) 338 l = s[:i].split(b' ') 339 type = l[0] 340 sz = int(l[1]) 341 content = s[i+1:] 342 assert(type in _typemap) 343 assert(sz == len(content)) 344 return (type, content) 345 346 347def _decode_packobj(buf): 348 assert(buf) 349 c = byte_int(buf[0]) 350 type = _typermap[(c & 0x70) >> 4] 351 sz = c & 0x0f 352 shift = 4 353 i = 0 354 while c & 0x80: 355 i += 1 356 c = byte_int(buf[i]) 357 sz |= (c & 0x7f) << shift 358 shift += 7 359 if not (c & 0x80): 360 break 361 return (type, zlib.decompress(buf[i+1:])) 362 363 364class PackIdx: 365 def __init__(self): 366 assert(0) 367 368 def find_offset(self, hash): 369 """Get the offset of an object inside the index file.""" 370 idx = self._idx_from_hash(hash) 371 if idx != None: 372 return self._ofs_from_idx(idx) 373 return None 374 375 def exists(self, hash, want_source=False): 376 """Return nonempty if the object exists in this index.""" 377 if hash and (self._idx_from_hash(hash) != None): 378 return want_source and os.path.basename(self.name) or True 379 return None 380 381 def _idx_from_hash(self, hash): 382 global _total_searches, _total_steps 383 _total_searches += 1 384 assert(len(hash) == 20) 385 b1 = byte_int(hash[0]) 386 start = self.fanout[b1-1] # range -1..254 387 end = self.fanout[b1] # range 0..255 388 want = hash 389 _total_steps += 1 # lookup table is a step 390 while start < end: 391 _total_steps += 1 392 mid = start + (end - start) // 2 393 v = self._idx_to_hash(mid) 394 if v < want: 395 start = mid+1 396 elif v > want: 397 end = mid 398 else: # got it! 399 return mid 400 return None 401 402 403class PackIdxV1(PackIdx): 404 """Object representation of a Git pack index (version 1) file.""" 405 def __init__(self, filename, f): 406 self.name = filename 407 self.idxnames = [self.name] 408 self.map = mmap_read(f) 409 # Min size for 'L' is 4, which is sufficient for struct's '!I' 410 self.fanout = array('L', struct.unpack('!256I', self.map)) 411 self.fanout.append(0) # entry "-1" 412 self.nsha = self.fanout[255] 413 self.sha_ofs = 256 * 4 414 # Avoid slicing shatable for individual hashes (very high overhead) 415 self.shatable = buffer(self.map, self.sha_ofs, self.nsha * 24) 416 417 def __enter__(self): 418 return self 419 420 def __exit__(self, type, value, traceback): 421 self.close() 422 423 def __len__(self): 424 return int(self.nsha) # int() from long for python 2 425 426 def _ofs_from_idx(self, idx): 427 if idx >= self.nsha or idx < 0: 428 raise IndexError('invalid pack index index %d' % idx) 429 ofs = self.sha_ofs + idx * 24 430 return struct.unpack_from('!I', self.map, offset=ofs)[0] 431 432 def _idx_to_hash(self, idx): 433 if idx >= self.nsha or idx < 0: 434 raise IndexError('invalid pack index index %d' % idx) 435 ofs = self.sha_ofs + idx * 24 + 4 436 return self.map[ofs : ofs + 20] 437 438 def __iter__(self): 439 start = self.sha_ofs + 4 440 for ofs in range(start, start + 24 * self.nsha, 24): 441 yield self.map[ofs : ofs + 20] 442 443 def close(self): 444 if self.map is not None: 445 self.shatable = None 446 self.map.close() 447 self.map = None 448 449 450class PackIdxV2(PackIdx): 451 """Object representation of a Git pack index (version 2) file.""" 452 def __init__(self, filename, f): 453 self.name = filename 454 self.idxnames = [self.name] 455 self.map = mmap_read(f) 456 assert self.map[0:8] == b'\377tOc\0\0\0\2' 457 # Min size for 'L' is 4, which is sufficient for struct's '!I' 458 self.fanout = array('L', struct.unpack_from('!256I', self.map, offset=8)) 459 self.fanout.append(0) 460 self.nsha = self.fanout[255] 461 self.sha_ofs = 8 + 256*4 462 self.ofstable_ofs = self.sha_ofs + self.nsha * 20 + self.nsha * 4 463 self.ofs64table_ofs = self.ofstable_ofs + self.nsha * 4 464 # Avoid slicing this for individual hashes (very high overhead) 465 self.shatable = buffer(self.map, self.sha_ofs, self.nsha*20) 466 467 def __enter__(self): 468 return self 469 470 def __exit__(self, type, value, traceback): 471 self.close() 472 473 def __len__(self): 474 return int(self.nsha) # int() from long for python 2 475 476 def _ofs_from_idx(self, idx): 477 if idx >= self.nsha or idx < 0: 478 raise IndexError('invalid pack index index %d' % idx) 479 ofs_ofs = self.ofstable_ofs + idx * 4 480 ofs = struct.unpack_from('!I', self.map, offset=ofs_ofs)[0] 481 if ofs & 0x80000000: 482 idx64 = ofs & 0x7fffffff 483 ofs64_ofs = self.ofs64table_ofs + idx64 * 8 484 ofs = struct.unpack_from('!Q', self.map, offset=ofs64_ofs)[0] 485 return ofs 486 487 def _idx_to_hash(self, idx): 488 if idx >= self.nsha or idx < 0: 489 raise IndexError('invalid pack index index %d' % idx) 490 ofs = self.sha_ofs + idx * 20 491 return self.map[ofs : ofs + 20] 492 493 def __iter__(self): 494 start = self.sha_ofs 495 for ofs in range(start, start + 20 * self.nsha, 20): 496 yield self.map[ofs : ofs + 20] 497 498 def close(self): 499 if self.map is not None: 500 self.shatable = None 501 self.map.close() 502 self.map = None 503 504 505_mpi_count = 0 506class PackIdxList: 507 def __init__(self, dir, ignore_midx=False): 508 global _mpi_count 509 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it 510 _mpi_count += 1 511 self.dir = dir 512 self.also = set() 513 self.packs = [] 514 self.do_bloom = False 515 self.bloom = None 516 self.ignore_midx = ignore_midx 517 self.refresh() 518 519 def __del__(self): 520 global _mpi_count 521 _mpi_count -= 1 522 assert(_mpi_count == 0) 523 524 def __iter__(self): 525 return iter(idxmerge(self.packs)) 526 527 def __len__(self): 528 return sum(len(pack) for pack in self.packs) 529 530 def exists(self, hash, want_source=False): 531 """Return nonempty if the object exists in the index files.""" 532 global _total_searches 533 _total_searches += 1 534 if hash in self.also: 535 return True 536 if self.do_bloom and self.bloom: 537 if self.bloom.exists(hash): 538 self.do_bloom = False 539 else: 540 _total_searches -= 1 # was counted by bloom 541 return None 542 for i in range(len(self.packs)): 543 p = self.packs[i] 544 _total_searches -= 1 # will be incremented by sub-pack 545 ix = p.exists(hash, want_source=want_source) 546 if ix: 547 # reorder so most recently used packs are searched first 548 self.packs = [p] + self.packs[:i] + self.packs[i+1:] 549 return ix 550 self.do_bloom = True 551 return None 552 553 def refresh(self, skip_midx = False): 554 """Refresh the index list. 555 This method verifies if .midx files were superseded (e.g. all of its 556 contents are in another, bigger .midx file) and removes the superseded 557 files. 558 559 If skip_midx is True, all work on .midx files will be skipped and .midx 560 files will be removed from the list. 561 562 The instance variable 'ignore_midx' can force this function to 563 always act as if skip_midx was True. 564 """ 565 if self.bloom is not None: 566 self.bloom.close() 567 self.bloom = None # Always reopen the bloom as it may have been relaced 568 self.do_bloom = False 569 skip_midx = skip_midx or self.ignore_midx 570 d = dict((p.name, p) for p in self.packs 571 if not skip_midx or not isinstance(p, midx.PackMidx)) 572 if os.path.exists(self.dir): 573 if not skip_midx: 574 midxl = [] 575 midxes = set(glob.glob(os.path.join(self.dir, b'*.midx'))) 576 # remove any *.midx files from our list that no longer exist 577 for ix in list(d.values()): 578 if not isinstance(ix, midx.PackMidx): 579 continue 580 if ix.name in midxes: 581 continue 582 # remove the midx 583 del d[ix.name] 584 ix.close() 585 self.packs.remove(ix) 586 for ix in self.packs: 587 if isinstance(ix, midx.PackMidx): 588 for name in ix.idxnames: 589 d[os.path.join(self.dir, name)] = ix 590 for full in midxes: 591 if not d.get(full): 592 mx = midx.PackMidx(full) 593 (mxd, mxf) = os.path.split(mx.name) 594 broken = False 595 for n in mx.idxnames: 596 if not os.path.exists(os.path.join(mxd, n)): 597 log(('warning: index %s missing\n' 598 ' used by %s\n') 599 % (path_msg(n), path_msg(mxf))) 600 broken = True 601 if broken: 602 mx.close() 603 del mx 604 unlink(full) 605 else: 606 midxl.append(mx) 607 midxl.sort(key=lambda ix: 608 (-len(ix), -xstat.stat(ix.name).st_mtime)) 609 for ix in midxl: 610 any_needed = False 611 for sub in ix.idxnames: 612 found = d.get(os.path.join(self.dir, sub)) 613 if not found or isinstance(found, PackIdx): 614 # doesn't exist, or exists but not in a midx 615 any_needed = True 616 break 617 if any_needed: 618 d[ix.name] = ix 619 for name in ix.idxnames: 620 d[os.path.join(self.dir, name)] = ix 621 elif not ix.force_keep: 622 debug1('midx: removing redundant: %s\n' 623 % path_msg(os.path.basename(ix.name))) 624 ix.close() 625 unlink(ix.name) 626 for full in glob.glob(os.path.join(self.dir, b'*.idx')): 627 if not d.get(full): 628 try: 629 ix = open_idx(full) 630 except GitError as e: 631 add_error(e) 632 continue 633 d[full] = ix 634 bfull = os.path.join(self.dir, b'bup.bloom') 635 if self.bloom is None and os.path.exists(bfull): 636 self.bloom = bloom.ShaBloom(bfull) 637 self.packs = list(set(d.values())) 638 self.packs.sort(reverse=True, key=lambda x: len(x)) 639 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self): 640 self.do_bloom = True 641 else: 642 self.bloom = None 643 debug1('PackIdxList: using %d index%s.\n' 644 % (len(self.packs), len(self.packs)!=1 and 'es' or '')) 645 646 def add(self, hash): 647 """Insert an additional object in the list.""" 648 self.also.add(hash) 649 650 651def open_idx(filename): 652 if filename.endswith(b'.idx'): 653 f = open(filename, 'rb') 654 header = f.read(8) 655 if header[0:4] == b'\377tOc': 656 version = struct.unpack('!I', header[4:8])[0] 657 if version == 2: 658 return PackIdxV2(filename, f) 659 else: 660 raise GitError('%s: expected idx file version 2, got %d' 661 % (path_msg(filename), version)) 662 elif len(header) == 8 and header[0:4] < b'\377tOc': 663 return PackIdxV1(filename, f) 664 else: 665 raise GitError('%s: unrecognized idx file header' 666 % path_msg(filename)) 667 elif filename.endswith(b'.midx'): 668 return midx.PackMidx(filename) 669 else: 670 raise GitError('idx filenames must end with .idx or .midx') 671 672 673def idxmerge(idxlist, final_progress=True): 674 """Generate a list of all the objects reachable in a PackIdxList.""" 675 def pfunc(count, total): 676 qprogress('Reading indexes: %.2f%% (%d/%d)\r' 677 % (count*100.0/total, count, total)) 678 def pfinal(count, total): 679 if final_progress: 680 progress('Reading indexes: %.2f%% (%d/%d), done.\n' 681 % (100, total, total)) 682 return merge_iter(idxlist, 10024, pfunc, pfinal) 683 684 685def _make_objcache(): 686 return PackIdxList(repo(b'objects/pack')) 687 688# bup-gc assumes that it can disable all PackWriter activities 689# (bloom/midx/cache) via the constructor and close() arguments. 690 691class PackWriter: 692 """Writes Git objects inside a pack file.""" 693 def __init__(self, objcache_maker=_make_objcache, compression_level=1, 694 run_midx=True, on_pack_finish=None, 695 max_pack_size=None, max_pack_objects=None, repo_dir=None): 696 self.repo_dir = repo_dir or repo() 697 self.file = None 698 self.parentfd = None 699 self.count = 0 700 self.outbytes = 0 701 self.filename = None 702 self.idx = None 703 self.objcache_maker = objcache_maker 704 self.objcache = None 705 self.compression_level = compression_level 706 self.run_midx=run_midx 707 self.on_pack_finish = on_pack_finish 708 if not max_pack_size: 709 max_pack_size = git_config_get(b'pack.packSizeLimit', 710 repo_dir=self.repo_dir) 711 if max_pack_size is not None: 712 max_pack_size = parse_num(max_pack_size) 713 if not max_pack_size: 714 # larger packs slow down pruning 715 max_pack_size = 1000 * 1000 * 1000 716 self.max_pack_size = max_pack_size 717 # cache memory usage is about 83 bytes per object 718 self.max_pack_objects = max_pack_objects if max_pack_objects \ 719 else max(1, self.max_pack_size // 5000) 720 721 def __del__(self): 722 self.close() 723 724 def __enter__(self): 725 return self 726 727 def __exit__(self, type, value, traceback): 728 self.close() 729 730 def _open(self): 731 if not self.file: 732 objdir = dir = os.path.join(self.repo_dir, b'objects') 733 fd, name = tempfile.mkstemp(suffix=b'.pack', dir=objdir) 734 try: 735 self.file = os.fdopen(fd, 'w+b') 736 except: 737 os.close(fd) 738 raise 739 try: 740 self.parentfd = os.open(objdir, os.O_RDONLY) 741 except: 742 f = self.file 743 self.file = None 744 f.close() 745 raise 746 assert name.endswith(b'.pack') 747 self.filename = name[:-5] 748 self.file.write(b'PACK\0\0\0\2\0\0\0\0') 749 self.idx = PackIdxV2Writer() 750 751 def _raw_write(self, datalist, sha): 752 self._open() 753 f = self.file 754 # in case we get interrupted (eg. KeyboardInterrupt), it's best if 755 # the file never has a *partial* blob. So let's make sure it's 756 # all-or-nothing. (The blob shouldn't be very big anyway, thanks 757 # to our hashsplit algorithm.) f.write() does its own buffering, 758 # but that's okay because we'll flush it in _end(). 759 oneblob = b''.join(datalist) 760 try: 761 f.write(oneblob) 762 except IOError as e: 763 reraise(GitError(e)) 764 nw = len(oneblob) 765 crc = zlib.crc32(oneblob) & 0xffffffff 766 self._update_idx(sha, crc, nw) 767 self.outbytes += nw 768 self.count += 1 769 return nw, crc 770 771 def _update_idx(self, sha, crc, size): 772 assert(sha) 773 if self.idx: 774 self.idx.add(sha, crc, self.file.tell() - size) 775 776 def _write(self, sha, type, content): 777 if verbose: 778 log('>') 779 if not sha: 780 sha = calc_hash(type, content) 781 size, crc = self._raw_write(_encode_packobj(type, content, 782 self.compression_level), 783 sha=sha) 784 if self.outbytes >= self.max_pack_size \ 785 or self.count >= self.max_pack_objects: 786 self.breakpoint() 787 return sha 788 789 def breakpoint(self): 790 """Clear byte and object counts and return the last processed id.""" 791 id = self._end(self.run_midx) 792 self.outbytes = self.count = 0 793 return id 794 795 def _require_objcache(self): 796 if self.objcache is None and self.objcache_maker: 797 self.objcache = self.objcache_maker() 798 if self.objcache is None: 799 raise GitError( 800 "PackWriter not opened or can't check exists w/o objcache") 801 802 def exists(self, id, want_source=False): 803 """Return non-empty if an object is found in the object cache.""" 804 self._require_objcache() 805 return self.objcache.exists(id, want_source=want_source) 806 807 def just_write(self, sha, type, content): 808 """Write an object to the pack file without checking for duplication.""" 809 self._write(sha, type, content) 810 # If nothing else, gc doesn't have/want an objcache 811 if self.objcache is not None: 812 self.objcache.add(sha) 813 814 def maybe_write(self, type, content): 815 """Write an object to the pack file if not present and return its id.""" 816 sha = calc_hash(type, content) 817 if not self.exists(sha): 818 self._require_objcache() 819 self.just_write(sha, type, content) 820 return sha 821 822 def new_blob(self, blob): 823 """Create a blob object in the pack with the supplied content.""" 824 return self.maybe_write(b'blob', blob) 825 826 def new_tree(self, shalist): 827 """Create a tree object in the pack.""" 828 content = tree_encode(shalist) 829 return self.maybe_write(b'tree', content) 830 831 def new_commit(self, tree, parent, 832 author, adate_sec, adate_tz, 833 committer, cdate_sec, cdate_tz, 834 msg): 835 """Create a commit object in the pack. The date_sec values must be 836 epoch-seconds, and if a tz is None, the local timezone is assumed.""" 837 if adate_tz is not None: 838 adate_str = _git_date_str(adate_sec, adate_tz) 839 else: 840 adate_str = _local_git_date_str(adate_sec) 841 if cdate_tz is not None: 842 cdate_str = _git_date_str(cdate_sec, cdate_tz) 843 else: 844 cdate_str = _local_git_date_str(cdate_sec) 845 l = [] 846 if tree: l.append(b'tree %s' % hexlify(tree)) 847 if parent: l.append(b'parent %s' % hexlify(parent)) 848 if author: l.append(b'author %s %s' % (author, adate_str)) 849 if committer: l.append(b'committer %s %s' % (committer, cdate_str)) 850 l.append(b'') 851 l.append(msg) 852 return self.maybe_write(b'commit', b'\n'.join(l)) 853 854 def abort(self): 855 """Remove the pack file from disk.""" 856 f = self.file 857 if f: 858 pfd = self.parentfd 859 self.file = None 860 self.parentfd = None 861 self.idx = None 862 try: 863 try: 864 os.unlink(self.filename + b'.pack') 865 finally: 866 f.close() 867 finally: 868 if pfd is not None: 869 os.close(pfd) 870 871 def _end(self, run_midx=True): 872 f = self.file 873 if not f: return None 874 self.file = None 875 try: 876 self.objcache = None 877 idx = self.idx 878 self.idx = None 879 880 # update object count 881 f.seek(8) 882 cp = struct.pack('!i', self.count) 883 assert(len(cp) == 4) 884 f.write(cp) 885 886 # calculate the pack sha1sum 887 f.seek(0) 888 sum = Sha1() 889 for b in chunkyreader(f): 890 sum.update(b) 891 packbin = sum.digest() 892 f.write(packbin) 893 fdatasync(f.fileno()) 894 finally: 895 f.close() 896 897 obj_list_sha = idx.write(self.filename + b'.idx', packbin) 898 nameprefix = os.path.join(self.repo_dir, 899 b'objects/pack/pack-' + obj_list_sha) 900 if os.path.exists(self.filename + b'.map'): 901 os.unlink(self.filename + b'.map') 902 os.rename(self.filename + b'.pack', nameprefix + b'.pack') 903 os.rename(self.filename + b'.idx', nameprefix + b'.idx') 904 try: 905 os.fsync(self.parentfd) 906 finally: 907 os.close(self.parentfd) 908 909 if run_midx: 910 auto_midx(os.path.join(self.repo_dir, b'objects/pack')) 911 912 if self.on_pack_finish: 913 self.on_pack_finish(nameprefix) 914 915 return nameprefix 916 917 def close(self, run_midx=True): 918 """Close the pack file and move it to its definitive path.""" 919 return self._end(run_midx=run_midx) 920 921 922class PackIdxV2Writer: 923 def __init__(self): 924 self.idx = list(list() for i in range(256)) 925 self.count = 0 926 927 def add(self, sha, crc, offs): 928 assert(sha) 929 self.count += 1 930 self.idx[byte_int(sha[0])].append((sha, crc, offs)) 931 932 def write(self, filename, packbin): 933 ofs64_count = 0 934 for section in self.idx: 935 for entry in section: 936 if entry[2] >= 2**31: 937 ofs64_count += 1 938 939 # Length: header + fan-out + shas-and-crcs + overflow-offsets 940 index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count) 941 idx_map = None 942 idx_f = open(filename, 'w+b') 943 try: 944 idx_f.truncate(index_len) 945 fdatasync(idx_f.fileno()) 946 idx_map = mmap_readwrite(idx_f, close=False) 947 try: 948 count = _helpers.write_idx(filename, idx_map, self.idx, 949 self.count) 950 assert(count == self.count) 951 idx_map.flush() 952 finally: 953 idx_map.close() 954 finally: 955 idx_f.close() 956 957 idx_f = open(filename, 'a+b') 958 try: 959 idx_f.write(packbin) 960 idx_f.seek(0) 961 idx_sum = Sha1() 962 b = idx_f.read(8 + 4*256) 963 idx_sum.update(b) 964 965 obj_list_sum = Sha1() 966 for b in chunkyreader(idx_f, 20 * self.count): 967 idx_sum.update(b) 968 obj_list_sum.update(b) 969 namebase = hexlify(obj_list_sum.digest()) 970 971 for b in chunkyreader(idx_f): 972 idx_sum.update(b) 973 idx_f.write(idx_sum.digest()) 974 fdatasync(idx_f.fileno()) 975 return namebase 976 finally: 977 idx_f.close() 978 979 980def list_refs(patterns=None, repo_dir=None, 981 limit_to_heads=False, limit_to_tags=False): 982 """Yield (refname, hash) tuples for all repository refs unless 983 patterns are specified. In that case, only include tuples for 984 refs matching those patterns (cf. git-show-ref(1)). The limits 985 restrict the result items to refs/heads or refs/tags. If both 986 limits are specified, items from both sources will be included. 987 988 """ 989 argv = [b'git', b'show-ref'] 990 if limit_to_heads: 991 argv.append(b'--heads') 992 if limit_to_tags: 993 argv.append(b'--tags') 994 argv.append(b'--') 995 if patterns: 996 argv.extend(patterns) 997 p = subprocess.Popen(argv, env=_gitenv(repo_dir), stdout=subprocess.PIPE) 998 out = p.stdout.read().strip() 999 rv = p.wait() # not fatal 1000 if rv: 1001 assert(not out) 1002 if out: 1003 for d in out.split(b'\n'): 1004 sha, name = d.split(b' ', 1) 1005 yield name, unhexlify(sha) 1006 1007 1008def read_ref(refname, repo_dir = None): 1009 """Get the commit id of the most recent commit made on a given ref.""" 1010 refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True) 1011 l = tuple(islice(refs, 2)) 1012 if l: 1013 assert(len(l) == 1) 1014 return l[0][1] 1015 else: 1016 return None 1017 1018 1019def rev_list_invocation(ref_or_refs, format=None): 1020 if isinstance(ref_or_refs, bytes): 1021 refs = (ref_or_refs,) 1022 else: 1023 refs = ref_or_refs 1024 argv = [b'git', b'rev-list'] 1025 1026 if format: 1027 argv.append(b'--pretty=format:' + format) 1028 for ref in refs: 1029 assert not ref.startswith(b'-') 1030 argv.append(ref) 1031 argv.append(b'--') 1032 return argv 1033 1034 1035def rev_list(ref_or_refs, parse=None, format=None, repo_dir=None): 1036 """Yield information about commits as per "git rev-list". If a format 1037 is not provided, yield one hex hash at a time. If a format is 1038 provided, pass it to rev-list and call parse(git_stdout) for each 1039 commit with the stream positioned just after the rev-list "commit 1040 HASH" header line. When a format is provided yield (oidx, 1041 parse(git_stdout)) for each commit. 1042 1043 """ 1044 assert bool(parse) == bool(format) 1045 p = subprocess.Popen(rev_list_invocation(ref_or_refs, 1046 format=format), 1047 env=_gitenv(repo_dir), 1048 stdout = subprocess.PIPE) 1049 if not format: 1050 for line in p.stdout: 1051 yield line.strip() 1052 else: 1053 line = p.stdout.readline() 1054 while line: 1055 s = line.strip() 1056 if not s.startswith(b'commit '): 1057 raise Exception('unexpected line ' + repr(s)) 1058 s = s[7:] 1059 assert len(s) == 40 1060 yield s, parse(p.stdout) 1061 line = p.stdout.readline() 1062 1063 rv = p.wait() # not fatal 1064 if rv: 1065 raise GitError('git rev-list returned error %d' % rv) 1066 1067 1068def get_commit_dates(refs, repo_dir=None): 1069 """Get the dates for the specified commit refs. For now, every unique 1070 string in refs must resolve to a different commit or this 1071 function will fail.""" 1072 result = [] 1073 for ref in refs: 1074 commit = get_commit_items(ref, cp(repo_dir)) 1075 result.append(commit.author_sec) 1076 return result 1077 1078 1079def rev_parse(committish, repo_dir=None): 1080 """Resolve the full hash for 'committish', if it exists. 1081 1082 Should be roughly equivalent to 'git rev-parse'. 1083 1084 Returns the hex value of the hash if it is found, None if 'committish' does 1085 not correspond to anything. 1086 """ 1087 head = read_ref(committish, repo_dir=repo_dir) 1088 if head: 1089 debug2("resolved from ref: commit = %s\n" % hexlify(head)) 1090 return head 1091 1092 pL = PackIdxList(repo(b'objects/pack', repo_dir=repo_dir)) 1093 1094 if len(committish) == 40: 1095 try: 1096 hash = unhexlify(committish) 1097 except TypeError: 1098 return None 1099 1100 if pL.exists(hash): 1101 return hash 1102 1103 return None 1104 1105 1106def update_ref(refname, newval, oldval, repo_dir=None): 1107 """Update a repository reference.""" 1108 if not oldval: 1109 oldval = b'' 1110 assert refname.startswith(b'refs/heads/') \ 1111 or refname.startswith(b'refs/tags/') 1112 p = subprocess.Popen([b'git', b'update-ref', refname, 1113 hexlify(newval), hexlify(oldval)], 1114 env=_gitenv(repo_dir)) 1115 _git_wait(b'git update-ref', p) 1116 1117 1118def delete_ref(refname, oldvalue=None): 1119 """Delete a repository reference (see git update-ref(1)).""" 1120 assert refname.startswith(b'refs/') 1121 oldvalue = [] if not oldvalue else [oldvalue] 1122 p = subprocess.Popen([b'git', b'update-ref', b'-d', refname] + oldvalue, 1123 env=_gitenv()) 1124 _git_wait('git update-ref', p) 1125 1126 1127def guess_repo(path=None): 1128 """Set the path value in the global variable "repodir". 1129 This makes bup look for an existing bup repository, but not fail if a 1130 repository doesn't exist. Usually, if you are interacting with a bup 1131 repository, you would not be calling this function but using 1132 check_repo_or_die(). 1133 """ 1134 global repodir 1135 if path: 1136 repodir = path 1137 if not repodir: 1138 repodir = environ.get(b'BUP_DIR') 1139 if not repodir: 1140 repodir = os.path.expanduser(b'~/.bup') 1141 1142 1143def init_repo(path=None): 1144 """Create the Git bare repository for bup in a given path.""" 1145 guess_repo(path) 1146 d = repo() # appends a / to the path 1147 parent = os.path.dirname(os.path.dirname(d)) 1148 if parent and not os.path.exists(parent): 1149 raise GitError('parent directory "%s" does not exist\n' 1150 % path_msg(parent)) 1151 if os.path.exists(d) and not os.path.isdir(os.path.join(d, b'.')): 1152 raise GitError('"%s" exists but is not a directory\n' % path_msg(d)) 1153 p = subprocess.Popen([b'git', b'--bare', b'init'], stdout=sys.stderr, 1154 env=_gitenv()) 1155 _git_wait('git init', p) 1156 # Force the index version configuration in order to ensure bup works 1157 # regardless of the version of the installed Git binary. 1158 p = subprocess.Popen([b'git', b'config', b'pack.indexVersion', '2'], 1159 stdout=sys.stderr, env=_gitenv()) 1160 _git_wait('git config', p) 1161 # Enable the reflog 1162 p = subprocess.Popen([b'git', b'config', b'core.logAllRefUpdates', b'true'], 1163 stdout=sys.stderr, env=_gitenv()) 1164 _git_wait('git config', p) 1165 1166 1167def check_repo_or_die(path=None): 1168 """Check to see if a bup repository probably exists, and abort if not.""" 1169 guess_repo(path) 1170 top = repo() 1171 pst = stat_if_exists(top + b'/objects/pack') 1172 if pst and stat.S_ISDIR(pst.st_mode): 1173 return 1174 if not pst: 1175 top_st = stat_if_exists(top) 1176 if not top_st: 1177 log('error: repository %r does not exist (see "bup help init")\n' 1178 % top) 1179 sys.exit(15) 1180 log('error: %s is not a repository\n' % path_msg(top)) 1181 sys.exit(14) 1182 1183 1184def is_suitable_git(ver_str): 1185 if not ver_str.startswith(b'git version '): 1186 return 'unrecognized' 1187 ver_str = ver_str[len(b'git version '):] 1188 if ver_str.startswith(b'0.'): 1189 return 'insufficient' 1190 if ver_str.startswith(b'1.'): 1191 if re.match(br'1\.[012345]rc', ver_str): 1192 return 'insufficient' 1193 if re.match(br'1\.[01234]\.', ver_str): 1194 return 'insufficient' 1195 if re.match(br'1\.5\.[012345]($|\.)', ver_str): 1196 return 'insufficient' 1197 if re.match(br'1\.5\.6-rc', ver_str): 1198 return 'insufficient' 1199 return 'suitable' 1200 if re.match(br'[0-9]+(\.|$)?', ver_str): 1201 return 'suitable' 1202 sys.exit(13) 1203 1204_git_great = None 1205 1206def require_suitable_git(ver_str=None): 1207 """Raise GitError if the version of git isn't suitable. 1208 1209 Rely on ver_str when provided, rather than invoking the git in the 1210 path. 1211 1212 """ 1213 global _git_great 1214 if _git_great is not None: 1215 return 1216 if environ.get(b'BUP_GIT_VERSION_IS_FINE', b'').lower() \ 1217 in (b'yes', b'true', b'1'): 1218 _git_great = True 1219 return 1220 if not ver_str: 1221 ver_str, _, _ = _git_exo([b'git', b'--version']) 1222 status = is_suitable_git(ver_str) 1223 if status == 'unrecognized': 1224 raise GitError('Unexpected git --version output: %r' % ver_str) 1225 if status == 'insufficient': 1226 log('error: git version must be at least 1.5.6\n') 1227 sys.exit(1) 1228 if status == 'suitable': 1229 _git_great = True 1230 return 1231 assert False 1232 1233 1234class _AbortableIter: 1235 def __init__(self, it, onabort = None): 1236 self.it = it 1237 self.onabort = onabort 1238 self.done = None 1239 1240 def __iter__(self): 1241 return self 1242 1243 def __next__(self): 1244 try: 1245 return next(self.it) 1246 except StopIteration as e: 1247 self.done = True 1248 raise 1249 except: 1250 self.abort() 1251 raise 1252 1253 next = __next__ 1254 1255 def abort(self): 1256 """Abort iteration and call the abortion callback, if needed.""" 1257 if not self.done: 1258 self.done = True 1259 if self.onabort: 1260 self.onabort() 1261 1262 def __del__(self): 1263 self.abort() 1264 1265 1266class CatPipe: 1267 """Link to 'git cat-file' that is used to retrieve blob data.""" 1268 def __init__(self, repo_dir = None): 1269 require_suitable_git() 1270 self.repo_dir = repo_dir 1271 self.p = self.inprogress = None 1272 1273 def _abort(self): 1274 if self.p: 1275 self.p.stdout.close() 1276 self.p.stdin.close() 1277 self.p = None 1278 self.inprogress = None 1279 1280 def restart(self): 1281 self._abort() 1282 self.p = subprocess.Popen([b'git', b'cat-file', b'--batch'], 1283 stdin=subprocess.PIPE, 1284 stdout=subprocess.PIPE, 1285 close_fds = True, 1286 bufsize = 4096, 1287 env=_gitenv(self.repo_dir)) 1288 1289 def get(self, ref): 1290 """Yield (oidx, type, size), followed by the data referred to by ref. 1291 If ref does not exist, only yield (None, None, None). 1292 1293 """ 1294 if not self.p or self.p.poll() != None: 1295 self.restart() 1296 assert(self.p) 1297 poll_result = self.p.poll() 1298 assert(poll_result == None) 1299 if self.inprogress: 1300 log('get: opening %r while %r is open\n' % (ref, self.inprogress)) 1301 assert(not self.inprogress) 1302 assert ref.find(b'\n') < 0 1303 assert ref.find(b'\r') < 0 1304 assert not ref.startswith(b'-') 1305 self.inprogress = ref 1306 self.p.stdin.write(ref + b'\n') 1307 self.p.stdin.flush() 1308 hdr = self.p.stdout.readline() 1309 if hdr.endswith(b' missing\n'): 1310 self.inprogress = None 1311 yield None, None, None 1312 return 1313 info = hdr.split(b' ') 1314 if len(info) != 3 or len(info[0]) != 40: 1315 raise GitError('expected object (id, type, size), got %r' % info) 1316 oidx, typ, size = info 1317 size = int(size) 1318 it = _AbortableIter(chunkyreader(self.p.stdout, size), 1319 onabort=self._abort) 1320 try: 1321 yield oidx, typ, size 1322 for blob in it: 1323 yield blob 1324 readline_result = self.p.stdout.readline() 1325 assert readline_result == b'\n' 1326 self.inprogress = None 1327 except Exception as e: 1328 it.abort() 1329 raise 1330 1331 def _join(self, it): 1332 _, typ, _ = next(it) 1333 if typ == b'blob': 1334 for blob in it: 1335 yield blob 1336 elif typ == b'tree': 1337 treefile = b''.join(it) 1338 for (mode, name, sha) in tree_decode(treefile): 1339 for blob in self.join(hexlify(sha)): 1340 yield blob 1341 elif typ == b'commit': 1342 treeline = b''.join(it).split(b'\n')[0] 1343 assert treeline.startswith(b'tree ') 1344 for blob in self.join(treeline[5:]): 1345 yield blob 1346 else: 1347 raise GitError('invalid object type %r: expected blob/tree/commit' 1348 % typ) 1349 1350 def join(self, id): 1351 """Generate a list of the content of all blobs that can be reached 1352 from an object. The hash given in 'id' must point to a blob, a tree 1353 or a commit. The content of all blobs that can be seen from trees or 1354 commits will be added to the list. 1355 """ 1356 for d in self._join(self.get(id)): 1357 yield d 1358 1359 1360_cp = {} 1361 1362def cp(repo_dir=None): 1363 """Create a CatPipe object or reuse the already existing one.""" 1364 global _cp, repodir 1365 if not repo_dir: 1366 repo_dir = repodir or repo() 1367 repo_dir = os.path.abspath(repo_dir) 1368 cp = _cp.get(repo_dir) 1369 if not cp: 1370 cp = CatPipe(repo_dir) 1371 _cp[repo_dir] = cp 1372 return cp 1373 1374 1375def tags(repo_dir = None): 1376 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}.""" 1377 tags = {} 1378 for n, c in list_refs(repo_dir = repo_dir, limit_to_tags=True): 1379 assert n.startswith(b'refs/tags/') 1380 name = n[10:] 1381 if not c in tags: 1382 tags[c] = [] 1383 tags[c].append(name) # more than one tag can point at 'c' 1384 return tags 1385 1386 1387class MissingObject(KeyError): 1388 def __init__(self, oid): 1389 self.oid = oid 1390 KeyError.__init__(self, 'object %r is missing' % hexlify(oid)) 1391 1392 1393WalkItem = namedtuple('WalkItem', ['oid', 'type', 'mode', 1394 'path', 'chunk_path', 'data']) 1395# The path is the mangled path, and if an item represents a fragment 1396# of a chunked file, the chunk_path will be the chunked subtree path 1397# for the chunk, i.e. ['', '2d3115e', ...]. The top-level path for a 1398# chunked file will have a chunk_path of ['']. So some chunk subtree 1399# of the file '/foo/bar/baz' might look like this: 1400# 1401# item.path = ['foo', 'bar', 'baz.bup'] 1402# item.chunk_path = ['', '2d3115e', '016b097'] 1403# item.type = 'tree' 1404# ... 1405 1406 1407def walk_object(get_ref, oidx, stop_at=None, include_data=None): 1408 """Yield everything reachable from oidx via get_ref (which must behave 1409 like CatPipe get) as a WalkItem, stopping whenever stop_at(oidx) 1410 returns true. Throw MissingObject if a hash encountered is 1411 missing from the repository, and don't read or return blob content 1412 in the data field unless include_data is set. 1413 1414 """ 1415 # Maintain the pending stack on the heap to avoid stack overflow 1416 pending = [(oidx, [], [], None)] 1417 while len(pending): 1418 oidx, parent_path, chunk_path, mode = pending.pop() 1419 oid = unhexlify(oidx) 1420 if stop_at and stop_at(oidx): 1421 continue 1422 1423 if (not include_data) and mode and stat.S_ISREG(mode): 1424 # If the object is a "regular file", then it's a leaf in 1425 # the graph, so we can skip reading the data if the caller 1426 # hasn't requested it. 1427 yield WalkItem(oid=oid, type=b'blob', 1428 chunk_path=chunk_path, path=parent_path, 1429 mode=mode, 1430 data=None) 1431 continue 1432 1433 item_it = get_ref(oidx) 1434 get_oidx, typ, _ = next(item_it) 1435 if not get_oidx: 1436 raise MissingObject(unhexlify(oidx)) 1437 if typ not in (b'blob', b'commit', b'tree'): 1438 raise Exception('unexpected repository object type %r' % typ) 1439 1440 # FIXME: set the mode based on the type when the mode is None 1441 if typ == b'blob' and not include_data: 1442 # Dump data until we can ask cat_pipe not to fetch it 1443 for ignored in item_it: 1444 pass 1445 data = None 1446 else: 1447 data = b''.join(item_it) 1448 1449 yield WalkItem(oid=oid, type=typ, 1450 chunk_path=chunk_path, path=parent_path, 1451 mode=mode, 1452 data=(data if include_data else None)) 1453 1454 if typ == b'commit': 1455 commit_items = parse_commit(data) 1456 for pid in commit_items.parents: 1457 pending.append((pid, parent_path, chunk_path, mode)) 1458 pending.append((commit_items.tree, parent_path, chunk_path, 1459 hashsplit.GIT_MODE_TREE)) 1460 elif typ == b'tree': 1461 for mode, name, ent_id in tree_decode(data): 1462 demangled, bup_type = demangle_name(name, mode) 1463 if chunk_path: 1464 sub_path = parent_path 1465 sub_chunk_path = chunk_path + [name] 1466 else: 1467 sub_path = parent_path + [name] 1468 if bup_type == BUP_CHUNKED: 1469 sub_chunk_path = [b''] 1470 else: 1471 sub_chunk_path = chunk_path 1472 pending.append((hexlify(ent_id), sub_path, sub_chunk_path, 1473 mode)) 1474