# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation;
# version 2.0 or (at your option) any later version. You can redistribute it
# and/or modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Classes for dealing with packed git objects.

A pack is a compact representation of a bunch of objects, stored
using deltas where possible.

A pack has two parts: the pack file, which stores the data, and an index
that tells you where the data is.

To find an object you look in all of the index files until you find a
match for the object name. You then use the offset obtained from the
index as a pointer into the corresponding pack file.
"""

from collections import defaultdict

import binascii
from io import BytesIO, UnsupportedOperation
from collections import (
    deque,
    )
import difflib
import struct

from itertools import chain
try:
    from itertools import imap, izip
except ImportError:
    # Python3
    imap = map
    izip = zip

import os
import sys

from hashlib import sha1
from os import (
    SEEK_CUR,
    SEEK_END,
    )
from struct import unpack_from
import zlib

try:
    import mmap
except ImportError:
    has_mmap = False
else:
    has_mmap = True

# For some reason the above try, except fails to set has_mmap = False for plan9
if sys.platform == 'Plan9':
    has_mmap = False

from dulwich.errors import (  # noqa: E402
    ApplyDeltaError,
    ChecksumMismatch,
    )
from dulwich.file import GitFile  # noqa: E402
from dulwich.lru_cache import (  # noqa: E402
    LRUSizeCache,
    )
from dulwich.objects import (  # noqa: E402
    ShaFile,
    hex_to_sha,
    sha_to_hex,
    object_header,
    )


OFS_DELTA = 6
REF_DELTA = 7

DELTA_TYPES = (OFS_DELTA, REF_DELTA)


DEFAULT_PACK_DELTA_WINDOW_SIZE = 10


def take_msb_bytes(read, crc32=None):
    """Read bytes marked with most significant bit.

    Args:
      read: Read function
      crc32: Optional starting CRC32 to update with the bytes read
    """
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        b = read(1)
        if crc32 is not None:
            crc32 = binascii.crc32(b, crc32)
        ret.append(ord(b[:1]))
    return ret, crc32


class PackFileDisappeared(Exception):

    def __init__(self, obj):
        self.obj = obj


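# Illustrative sketch (not part of the library): most variable-length integers
# in pack files use the MSB-continuation scheme consumed by take_msb_bytes()
# above: the high bit of each byte signals that another byte follows.
#
#     from io import BytesIO
#     ret, crc = take_msb_bytes(BytesIO(bytes([0x95, 0x03])).read)
#     # ret == [0x95, 0x03]: 0x95 has its high bit set, so reading continued;
#     # 0x03 does not, so the sequence ended there.  crc is None because no
#     # starting CRC32 was supplied.

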
class UnpackedObject(object):
    """Class encapsulating an object unpacked from a pack file.

    These objects should only be created from within unpack_object. Most
    members start out as empty and are filled in at various points by
    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.

    End users of this object should take care that the function they're
    getting this object from is guaranteed to set the members they need.
    """

    __slots__ = [
        'offset',         # Offset in its pack.
        '_sha',           # Cached binary SHA.
        'obj_type_num',   # Type of this object.
        'obj_chunks',     # Decompressed and delta-resolved chunks.
        'pack_type_num',  # Type of this object in the pack (may be a delta).
        'delta_base',     # Delta base offset or SHA.
        'comp_chunks',    # Compressed object chunks.
        'decomp_chunks',  # Decompressed object chunks.
        'decomp_len',     # Decompressed length of this object.
        'crc32',          # CRC32.
        ]

    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
    # methods of this object.
    def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
        self.offset = None
        self._sha = None
        self.pack_type_num = pack_type_num
        self.delta_base = delta_base
        self.comp_chunks = None
        self.decomp_chunks = []
        self.decomp_len = decomp_len
        self.crc32 = crc32

        if pack_type_num in DELTA_TYPES:
            self.obj_type_num = None
            self.obj_chunks = None
        else:
            self.obj_type_num = pack_type_num
            self.obj_chunks = self.decomp_chunks
            self.delta_base = delta_base

    def sha(self):
        """Return the binary SHA of this object."""
        if self._sha is None:
            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
        return self._sha

    def sha_file(self):
        """Return a ShaFile from this object."""
        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)

    # Only provided for backwards compatibility with code that expects either
    # chunks or a delta tuple.
    def _obj(self):
        """Return the decompressed chunks, or (delta base, delta chunks)."""
        if self.pack_type_num in DELTA_TYPES:
            return (self.delta_base, self.decomp_chunks)
        else:
            return self.decomp_chunks

    def __eq__(self, other):
        if not isinstance(other, UnpackedObject):
            return False
        for slot in self.__slots__:
            if getattr(self, slot) != getattr(other, slot):
                return False
        return True

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        data = ['%s=%r' % (s, getattr(self, s)) for s in self.__slots__]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(data))


_ZLIB_BUFSIZE = 4096


def read_zlib_chunks(read_some, unpacked, include_comp=False,
                     buffer_size=_ZLIB_BUFSIZE):
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    Args:
      read_some: Read function that returns at least one byte, but may
        return less than the requested size.
      unpacked: An UnpackedObject to write result data to. If its crc32
        attr is not None, the CRC32 of the compressed bytes will be computed
        using this starting CRC32.
        After this function, will have the following attrs set:
        * comp_chunks (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32
      include_comp: If True, include compressed data in the result.
      buffer_size: Size of the read buffer.
    Returns: Leftover unused data from the decompression.
    Raises:
      zlib.error: if a decompression error occurred.
    """
    if unpacked.decomp_len <= -1:
        raise ValueError('non-negative zlib data stream size expected')
    decomp_obj = zlib.decompressobj()

    comp_chunks = []
    decomp_chunks = unpacked.decomp_chunks
    decomp_len = 0
    crc32 = unpacked.crc32

    while True:
        add = read_some(buffer_size)
        if not add:
            raise zlib.error('EOF before end of zlib stream')
        comp_chunks.append(add)
        decomp = decomp_obj.decompress(add)
        decomp_len += len(decomp)
        decomp_chunks.append(decomp)
        unused = decomp_obj.unused_data
        if unused:
            left = len(unused)
            if crc32 is not None:
                crc32 = binascii.crc32(add[:-left], crc32)
            if include_comp:
                comp_chunks[-1] = add[:-left]
            break
        elif crc32 is not None:
            crc32 = binascii.crc32(add, crc32)
    if crc32 is not None:
        crc32 &= 0xffffffff

    if decomp_len != unpacked.decomp_len:
        raise zlib.error('decompressed data does not match expected size')

    unpacked.crc32 = crc32
    if include_comp:
        unpacked.comp_chunks = comp_chunks
    return unused


def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    Args:
      iter: Iterator over string objects
    Returns: 40-byte hex sha1 digest
    """
    sha = sha1()
    for name in iter:
        sha.update(name)
    return sha.hexdigest().encode('ascii')


def load_pack_index(path):
    """Load an index file by path.

    Args:
      path: Path to the index file
    Returns: A PackIndex loaded from the given path
    """
    with GitFile(path, 'rb') as f:
        return load_pack_index_file(path, f)


def _load_file_contents(f, size=None):
    try:
        fd = f.fileno()
    except (UnsupportedOperation, AttributeError):
        fd = None
    # Attempt to use mmap if possible
    if fd is not None:
        if size is None:
            size = os.fstat(fd).st_size
        if has_mmap:
            try:
                contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
            except mmap.error:
                # Perhaps a socket?
                pass
            else:
                return contents, size
    contents = f.read()
    size = len(contents)
    return contents, size


def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    Args:
      path: Path for the index file
      f: File-like object
    Returns: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    if contents[:4] == b'\377tOc':
        version = struct.unpack(b'>L', contents[4:8])[0]
        if version == 2:
            return PackIndex2(
                path, file=f, contents=contents, size=size)
        else:
            raise KeyError('Unknown pack index format %d' % version)
    else:
        return PackIndex1(path, file=f, contents=contents, size=size)


def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    Args:
      start: Start index of range to search
      end: End index of range to search
      sha: Sha to find
      unpack_name: Callback to retrieve SHA by index
    Returns: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    while start <= end:
        i = (start + end) // 2
        file_sha = unpack_name(i)
        if file_sha < sha:
            start = i + 1
        elif file_sha > sha:
            end = i - 1
        else:
            return i
    return None


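# Illustrative sketch (not part of the library): bisect_find_sha() above
# searches an inclusive [start, end] index range through a per-index lookup
# callback.
#
#     names = sorted([b'\x01' * 20, b'\x05' * 20, b'\x09' * 20])
#     bisect_find_sha(0, len(names) - 1, b'\x05' * 20, names.__getitem__)  # -> 1
#     bisect_find_sha(0, len(names) - 1, b'\x02' * 20, names.__getitem__)  # -> None

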
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.
    """

    def __eq__(self, other):
        if not isinstance(other, PackIndex):
            return False

        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
                                                 other.iterentries()):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __len__(self):
        """Return the number of entries in this pack index."""
        raise NotImplementedError(self.__len__)

    def __iter__(self):
        """Iterate over the SHAs in this pack."""
        return imap(sha_to_hex, self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        raise NotImplementedError(self.iterentries)

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest
        """
        raise NotImplementedError(self.get_pack_checksum)

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        try:
            return self._object_index(sha)
        except ValueError:
            closed = getattr(self._contents, 'closed', None)
            if closed in (None, True):
                raise PackFileDisappeared(self)
            raise

    def object_sha1(self, index):
        """Return the SHA1 corresponding to the index in the pack file."""
        # PERFORMANCE/TODO(jelmer): Avoid scanning entire index
        for (name, offset, crc32) in self.iterentries():
            if offset == index:
                return name
        else:
            raise KeyError(index)

    def _object_index(self, sha):
        """See object_index.

        Args:
          sha: A *binary* SHA string. (20 characters long)
        """
        raise NotImplementedError(self._object_index)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        Note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def _itersha(self):
        """Yield all the SHA1's of the objects in the index, sorted."""
        raise NotImplementedError(self._itersha)


class MemoryPackIndex(PackIndex):
    """Pack index that is stored entirely in memory."""

    def __init__(self, entries, pack_checksum=None):
        """Create a new MemoryPackIndex.

        Args:
          entries: Sequence of name, idx, crc32 (sorted)
          pack_checksum: Optional pack checksum
        """
        self._by_sha = {}
        self._by_index = {}
        for name, idx, crc32 in entries:
            self._by_sha[name] = idx
            self._by_index[idx] = name
        self._entries = entries
        self._pack_checksum = pack_checksum

    def get_pack_checksum(self):
        return self._pack_checksum

    def __len__(self):
        return len(self._entries)

    def _object_index(self, sha):
        return self._by_sha[sha]

    def object_sha1(self, index):
        return self._by_index[index]

    def _itersha(self):
        return iter(self._by_sha)

    def iterentries(self):
        return iter(self._entries)


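# Illustrative sketch (not part of the library): the fan-out table described in
# FilePackIndex below narrows a SHA lookup to one bisectable slice.  Entry i of
# the 256-entry table is the number of objects whose first SHA byte is <= i, so
# the candidate index range for a binary SHA could be derived like this:
#
#     def _fanout_range(fan_out_table, sha):
#         idx = ord(sha[:1])
#         start = 0 if idx == 0 else fan_out_table[idx - 1]
#         return start, fan_out_table[idx]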
class FilePackIndex(PackIndex):
    """Pack index that is based on a file.

    To do the lookup it opens the file and indexes the first 256 four-byte
    groups by the first byte of the sha id. The value stored for a given byte
    is the end of the group of entries whose SHAs start with that byte;
    subtracting one from the byte and looking up again gives the start of the
    group. Within a group the entries are sorted by sha id, so the start and
    end offsets bound a bisection search that determines whether the value is
    present.
    """

    def __init__(self, filename, file=None, contents=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will
        map it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, 'rb')
        else:
            self._file = file
        if contents is None:
            self._contents, self._size = _load_file_contents(self._file, size)
        else:
            self._contents, self._size = (contents, size)

    @property
    def path(self):
        return self._filename

    def __eq__(self, other):
        # Quick optimization:
        if (isinstance(other, FilePackIndex) and
                self._fan_out_table != other._fan_out_table):
            return False

        return super(FilePackIndex, self).__eq__(other)

    def close(self):
        self._file.close()
        if getattr(self._contents, "close", None) is not None:
            self._contents.close()

    def __len__(self):
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        Returns: Tuple with object name (SHA), offset in pack file and CRC32
            checksum (if known).
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the ith object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def _itersha(self):
        for i in range(len(self)):
            yield self._unpack_name(i)

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        ret = []
        for i in range(0x100):
            fanout_entry = self._contents[
                start_offset+i*4:start_offset+(i+1)*4]
            ret.append(struct.unpack('>L', fanout_entry)[0])
        return ret

    def check(self):
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        Returns: This is a 20-byte binary digest
        """
        return sha1(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-20:])

    def _object_index(self, sha):
        """See object_index.

        Args:
          sha: A *binary* SHA string. (20 characters long)
        """
        assert len(sha) == 20
        idx = ord(sha[:1])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)


class PackIndex1(FilePackIndex):
    """Version 1 Pack Index file."""

    def __init__(self, filename, file=None, contents=None, size=None):
        super(PackIndex1, self).__init__(filename, file, contents, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        (offset, name) = unpack_from('>L20s', self._contents,
                                     (0x100 * 4) + (i * 24))
        return (name, offset, None)

    def _unpack_name(self, i):
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from('>L', self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files
        return None


class PackIndex2(FilePackIndex):
    """Version 2 Pack Index file."""

    def __init__(self, filename, file=None, contents=None, size=None):
        super(PackIndex2, self).__init__(filename, file, contents, size)
        if self._contents[:4] != b'\377tOc':
            raise AssertionError('Not a v2 pack index file')
        (self.version, ) = unpack_from(b'>L', self._contents, 4)
        if self.version != 2:
            raise AssertionError('Version was %d' % self.version)
        self._fan_out_table = self._read_fan_out_table(8)
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = (self._crc32_table_offset +
                                          4 * len(self))
        self._pack_offset_largetable_offset = (
            self._pack_offset_table_offset + 4 * len(self))

    def _unpack_entry(self, i):
        return (self._unpack_name(i), self._unpack_offset(i),
                self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        offset = self._name_table_offset + i * 20
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = self._pack_offset_table_offset + i * 4
        offset = unpack_from('>L', self._contents, offset)[0]
        if offset & (2**31):
            offset = (
                self._pack_offset_largetable_offset +
                (offset & (2 ** 31 - 1)) * 8)
            offset = unpack_from('>Q', self._contents, offset)[0]
        return offset

    def _unpack_crc32_checksum(self, i):
        return unpack_from('>L', self._contents,
                           self._crc32_table_offset + i * 4)[0]


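# Illustrative sketch (not part of the library): the pack data file starts with
# a 12-byte header: the magic b'PACK', a 4-byte big-endian version (2 or 3) and
# a 4-byte big-endian object count, which read_pack_header() below parses.
#
#     import io
#     hdr = b'PACK' + struct.pack('>L', 2) + struct.pack('>L', 3)
#     read_pack_header(io.BytesIO(hdr).read)  # -> (2, 3)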
def read_pack_header(read):
    """Read the header of a pack file.

    Args:
      read: Read function
    Returns: Tuple of (pack version, number of objects). If no data is
        available to read, returns (None, None).
    """
    header = read(12)
    if not header:
        return None, None
    if header[:4] != b'PACK':
        raise AssertionError('Invalid pack header %r' % header)
    (version,) = unpack_from(b'>L', header, 4)
    if version not in (2, 3):
        raise AssertionError('Version was %d' % version)
    (num_objects,) = unpack_from(b'>L', header, 8)
    return (version, num_objects)


def chunks_length(chunks):
    if isinstance(chunks, bytes):
        return len(chunks)
    else:
        return sum(imap(len, chunks))


def unpack_object(read_all, read_some=None, compute_crc32=False,
                  include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE):
    """Unpack a Git object.

    Args:
      read_all: Read function that blocks until the number of requested
        bytes are read.
      read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
      compute_crc32: If True, compute the CRC32 of the compressed data. If
        False, the returned CRC32 will be None.
      include_comp: If True, include compressed data in the result.
      zlib_bufsize: An optional buffer size for zlib operations.
    Returns: A tuple of (unpacked, unused), where unused is the unused data
        leftover from decompression, and unpacked is an UnpackedObject with
        the following attrs set:

        * obj_chunks (for non-delta types)
        * pack_type_num
        * delta_base (for delta types)
        * comp_chunks (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32 (if compute_crc32 is True)
    """
    if read_some is None:
        read_some = read_all
    if compute_crc32:
        crc32 = 0
    else:
        crc32 = None

    bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
    type_num = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)

    raw_base = len(bytes)
    if type_num == OFS_DELTA:
        bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
        raw_base += len(bytes)
        if bytes[-1] & 0x80:
            raise AssertionError
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        delta_base = delta_base_offset
    elif type_num == REF_DELTA:
        delta_base = read_all(20)
        if compute_crc32:
            crc32 = binascii.crc32(delta_base, crc32)
        raw_base += 20
    else:
        delta_base = None

    unpacked = UnpackedObject(type_num, delta_base, size, crc32)
    unused = read_zlib_chunks(read_some, unpacked, buffer_size=zlib_bufsize,
                              include_comp=include_comp)
    return unpacked, unused


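# Illustrative sketch (not part of the library): a pack entry is a type/size
# header followed by a zlib stream.  unpack_object() above can be exercised on
# a hand-built entry; the trailing byte stands in for the data that always
# follows an entry inside a real pack file, and pack_object_header() is
# defined further below in this module.
#
#     import io
#     raw = b'foo'
#     entry = (bytes(pack_object_header(3, None, len(raw))) +
#              zlib.compress(raw) + b'\x00')
#     unpacked, unused = unpack_object(io.BytesIO(entry).read)
#     assert b''.join(unpacked.decomp_chunks) == raw and unused == b'\x00'

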
def _compute_object_size(value):
    """Compute the size of an unresolved object for use with LRUSizeCache."""
    (num, obj) = value
    if num in DELTA_TYPES:
        return chunks_length(obj[1])
    return chunks_length(obj)


class PackStreamReader(object):
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE):
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        else:
            self.read_some = read_some
        self.sha = sha1()
        self._offset = 0
        self._rbuf = BytesIO()
        # trailer is a deque to avoid memory allocation on small reads
        self._trailer = deque()
        self._zlib_bufsize = zlib_bufsize

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read).

        Args:
          read: The read callback to read from.
          size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        """
        data = read(size)

        # maintain a trailer of the last 20 bytes we've read
        n = len(data)
        self._offset += n
        tn = len(self._trailer)
        if n >= 20:
            to_pop = tn
            to_add = 20
        else:
            to_pop = max(n + tn - 20, 0)
            to_add = n
        self.sha.update(
            bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])))
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])
        return data

    def _buf_len(self):
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        end = buf.tell()
        buf.seek(start)
        return end - start

    @property
    def offset(self):
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        if buf_len >= size:
            return self._rbuf.read(size)
        buf_data = self._rbuf.read()
        self._rbuf = BytesIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        if buf_len:
            data = self._rbuf.read(size)
            if size >= buf_len:
                self._rbuf = BytesIO()
            return data
        return self._read(self.read_some, size)

    def __len__(self):
        return self._num_objects

    def read_objects(self, compute_crc32=False):
        """Read the objects in this pack file.

        Args:
          compute_crc32: If True, compute the CRC32 of the compressed
            data. If False, the returned CRC32 will be None.
        Returns: Iterator over UnpackedObjects with the following members set:
            offset
            obj_type_num
            obj_chunks (for non-delta types)
            delta_base (for delta types)
            decomp_chunks
            decomp_len
            crc32 (if compute_crc32 is True)
        Raises:
          ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
          zlib.error: if an error occurred during zlib decompression.
          IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)
        if pack_version is None:
            return

        for i in range(self._num_objects):
            offset = self.offset
            unpacked, unused = unpack_object(
                self.read, read_some=self.recv, compute_crc32=compute_crc32,
                zlib_bufsize=self._zlib_bufsize)
            unpacked.offset = offset

            # prepend any unused data to current read buffer
            buf = BytesIO()
            buf.write(unused)
            buf.write(self._rbuf.read())
            buf.seek(0)
            self._rbuf = buf

            yield unpacked

        if self._buf_len() < 20:
            # If the read buffer is full, then the last read() got the whole
            # trailer off the wire.
            # If not, it means there is still some of the
            # trailer to read. We need to read() all 20 bytes; N come from the
            # read buffer and (20 - N) come from the wire.
            self.read(20)

        pack_sha = bytearray(self._trailer)
        if pack_sha != self.sha.digest():
            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())


class PackStreamCopier(PackStreamReader):
    """Class to verify a pack stream as it is being read.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate and written out to the given file-like object.
    """

    def __init__(self, read_all, read_some, outfile, delta_iter=None):
        """Initialize the copier.

        Args:
          read_all: Read function that blocks until the number of
            requested bytes are read.
          read_some: Read function that returns at least one byte, but may
            not return the number of bytes requested.
          outfile: File-like object to write output through.
          delta_iter: Optional DeltaChainIterator to record deltas as we
            read them.
        """
        super(PackStreamCopier, self).__init__(read_all, read_some=read_some)
        self.outfile = outfile
        self._delta_iter = delta_iter

    def _read(self, read, size):
        """Read data from the read callback and write it to the file."""
        data = super(PackStreamCopier, self)._read(read, size)
        self.outfile.write(data)
        return data

    def verify(self):
        """Verify a pack stream and write it to the output file.

        See PackStreamReader.iterobjects for a list of exceptions this may
        throw.
        """
        if self._delta_iter:
            for unpacked in self.read_objects():
                self._delta_iter.record(unpacked)
        else:
            for _ in self.read_objects():
                pass


def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks."""
    sha = sha1()
    sha.update(object_header(type, chunks_length(chunks)))
    if isinstance(chunks, bytes):
        sha.update(chunks)
    else:
        for chunk in chunks:
            sha.update(chunk)
    return sha.digest()


def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
    """Hash a portion of a file into a new SHA.

    Args:
      f: A file-like object to read from that supports seek().
      start_ofs: The offset in the file to start reading at.
      end_ofs: The offset in the file to end reading at, relative to the
        end of the file.
      buffer_size: A buffer size for reading.
    Returns: A new SHA object updated with data read from the file.
    """
    sha = sha1()
    f.seek(0, SEEK_END)
    length = f.tell()
    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
        raise AssertionError(
            "Attempt to read beyond file length. "
            "start_ofs: %d, end_ofs: %d, file length: %d" % (
                start_ofs, end_ofs, length))
    todo = length + end_ofs - start_ofs
    f.seek(start_ofs)
    while todo:
        data = f.read(min(todo, buffer_size))
        sha.update(data)
        todo -= len(data)
    return sha


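# Illustrative sketch (not part of the library): compute_file_sha() above
# hashes a slice of a file bounded by an absolute start offset and an offset
# relative to the end of the file.
#
#     import io
#     f = io.BytesIO(b'abcdefghij')
#     compute_file_sha(f, start_ofs=2, end_ofs=-2).digest()  # hashes b'cdefgh'

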
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next 3 MS bits are the type, which tells you the
    type of object, and whether it is a delta; the LS 4 bits are the lowest
    bits of the size. For each subsequent byte the LS 7 bits are the next MS
    bits of the size, i.e. the last byte of the header contains the MS bits
    of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object representing the pack in the given filename.

        The file must exist and stay readable until the object is disposed of.
        It must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        if file is None:
            self._file = GitFile(self._filename, 'rb')
        else:
            self._file = file
        (version, self._num_objects) = read_pack_header(self._file.read)
        self._offset_cache = LRUSizeCache(
            1024*1024*20, compute_size=_compute_object_size)
        self.pack = None

    @property
    def filename(self):
        return os.path.basename(self._filename)

    @property
    def path(self):
        return self._filename

    @classmethod
    def from_file(cls, file, size):
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path):
        return cls(filename=path)

    def close(self):
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _get_size(self):
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = ('%s is too small for a packfile (%d < %d)' %
                      (self._filename, self._size, self._header_size))
            raise AssertionError(errmsg)
        return self._size

    def __len__(self):
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        Returns: 20-byte binary SHA1 digest
        """
        return compute_file_sha(self._file, end_ofs=-20).digest()

    def get_ref(self, sha):
        """Get the object for a ref SHA, only looking in this pack."""
        # TODO: cache these results
        if self.pack is None:
            raise KeyError(sha)
        try:
            offset = self.pack.index.object_index(sha)
        except KeyError:
            offset = None
        if offset:
            type, obj = self.get_object_at(offset)
        elif self.pack is not None and self.pack.resolve_ext_ref:
            type, obj = self.pack.resolve_ext_ref(sha)
        else:
            raise KeyError(sha)
        return offset, type, obj

    def resolve_object(self, offset, type, obj, get_ref=None):
        """Resolve an object, possibly resolving deltas when necessary.

        Returns: Tuple with object type and contents.
        """
        # Walk down the delta chain, building a stack of deltas to reach
        # the requested object.
        base_offset = offset
        base_type = type
        base_obj = obj
        delta_stack = []
        while base_type in DELTA_TYPES:
            prev_offset = base_offset
            if get_ref is None:
                get_ref = self.get_ref
            if base_type == OFS_DELTA:
                (delta_offset, delta) = base_obj
                # TODO: clean up asserts and replace with nicer error messages
                base_offset = base_offset - delta_offset
                base_type, base_obj = self.get_object_at(base_offset)
                assert isinstance(base_type, int)
            elif base_type == REF_DELTA:
                (basename, delta) = base_obj
                assert isinstance(basename, bytes) and len(basename) == 20
                base_offset, base_type, base_obj = get_ref(basename)
                assert isinstance(base_type, int)
            delta_stack.append((prev_offset, base_type, delta))

        # Now grab the base object (mustn't be a delta) and apply the
        # deltas all the way up the stack.
        chunks = base_obj
        for prev_offset, delta_type, delta in reversed(delta_stack):
            chunks = apply_delta(chunks, delta)
            # TODO(dborowitz): This can result in poor performance if
            # large base objects are separated from deltas in the pack.
            # We should reorganize so that we apply deltas to all
            # objects in a chain one after the other to optimize cache
            # performance.
            if prev_offset is not None:
                self._offset_cache[prev_offset] = base_type, chunks
        return base_type, chunks

    def iterobjects(self, progress=None, compute_crc32=True):
        self._file.seek(self._header_size)
        for i in range(1, self._num_objects + 1):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=compute_crc32)
            if progress is not None:
                progress(i, self._num_objects)
            yield (offset, unpacked.pack_type_num, unpacked._obj(),
                   unpacked.crc32)
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def _iter_unpacked(self):
        # TODO(dborowitz): Merge this with iterobjects, if we can change its
        # return type.
        self._file.seek(self._header_size)

        if self._num_objects is None:
            return

        for _ in range(self._num_objects):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=False)
            unpacked.offset = offset
            yield unpacked
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def iterentries(self, progress=None):
        """Yield entries summarizing the contents of this pack.

        Args:
          progress: Progress function, called with current and total
            object count.
        Returns: iterator of tuples with (sha, offset, crc32)
        """
        num_objects = self._num_objects
        resolve_ext_ref = (
            self.pack.resolve_ext_ref if self.pack is not None else None)
        indexer = PackIndexer.for_pack_data(
            self, resolve_ext_ref=resolve_ext_ref)
        for i, result in enumerate(indexer):
            if progress is not None:
                progress(i, num_objects)
            yield result

    def sorted_entries(self, progress=None):
        """Return entries in this pack, sorted by SHA.

        Args:
          progress: Progress function, called with current and total
            object count
        Returns: List of tuples with (sha, offset, crc32)
        """
        ret = sorted(self.iterentries(progress=progress))
        return ret

    def create_index_v1(self, filename, progress=None):
        """Create a version 1 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(progress=progress)
        with GitFile(filename, 'wb') as f:
            return write_pack_index_v1(f, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None):
        """Create a version 2 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(progress=progress)
        with GitFile(filename, 'wb') as f:
            return write_pack_index_v2(f, entries, self.calculate_checksum())

    def create_index(self, filename, progress=None,
                     version=2):
        """Create an index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          version: Index format version to write (1 or 2)
        Returns: Checksum of index file
        """
        if version == 1:
            return self.create_index_v1(filename, progress)
        elif version == 2:
            return self.create_index_v2(filename, progress)
        else:
            raise ValueError('unknown index format %d' % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        self._file.seek(-20, SEEK_END)
        return self._file.read(20)

    def check(self):
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def get_compressed_data_at(self, offset):
        """Given offset in the packfile return compressed data that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read, include_comp=True)
        return (unpacked.pack_type_num, unpacked.delta_base,
                unpacked.comp_chunks)

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        try:
            return self._offset_cache[offset]
        except KeyError:
            pass
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read)
        return (unpacked.pack_type_num, unpacked._obj())


class DeltaChainIterator(object):
    """Abstract iterator over pack data based on delta chains.

    Each object in the pack is guaranteed to be inflated exactly once,
    regardless of how many objects reference it as a delta base. As a result,
    memory usage is proportional to the length of the longest delta chain.

    Subclasses can override _result to define the result type of the iterator.
    By default, results are UnpackedObjects with the following members set:

    * offset
    * obj_type_num
    * obj_chunks
    * pack_type_num
    * delta_base (for delta types)
    * comp_chunks (if _include_comp is True)
    * decomp_chunks
    * decomp_len
    * crc32 (if _compute_crc32 is True)
    """

    _compute_crc32 = False
    _include_comp = False

    def __init__(self, file_obj, resolve_ext_ref=None):
        self._file = file_obj
        self._resolve_ext_ref = resolve_ext_ref
        self._pending_ofs = defaultdict(list)
        self._pending_ref = defaultdict(list)
        self._full_ofs = []
        self._shas = {}
        self._ext_refs = []

    @classmethod
    def for_pack_data(cls, pack_data, resolve_ext_ref=None):
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack_data)
        for unpacked in pack_data._iter_unpacked():
            walker.record(unpacked)
        return walker

    def record(self, unpacked):
        type_num = unpacked.pack_type_num
        offset = unpacked.offset
        if type_num == OFS_DELTA:
            base_offset = offset - unpacked.delta_base
            self._pending_ofs[base_offset].append(offset)
        elif type_num == REF_DELTA:
            self._pending_ref[unpacked.delta_base].append(offset)
        else:
            self._full_ofs.append((offset, type_num))

    def set_pack_data(self, pack_data):
        self._file = pack_data._file

    def _walk_all_chains(self):
        for offset, type_num in self._full_ofs:
            for result in self._follow_chain(offset, type_num, None):
                yield result
        for result in self._walk_ref_chains():
            yield result
        assert not self._pending_ofs

    def _ensure_no_pending(self):
        if self._pending_ref:
            raise KeyError([sha_to_hex(s) for s in self._pending_ref])

    def _walk_ref_chains(self):
        if not self._resolve_ext_ref:
            self._ensure_no_pending()
            return

        for base_sha, pending in sorted(self._pending_ref.items()):
            if base_sha not in self._pending_ref:
                continue
            try:
                type_num, chunks = self._resolve_ext_ref(base_sha)
            except KeyError:
                # Not an external ref, but may depend on one. Either it will
                # get popped via a _follow_chain call, or we will raise an
                # error below.
                continue
            self._ext_refs.append(base_sha)
            self._pending_ref.pop(base_sha)
            for new_offset in pending:
                for result in self._follow_chain(new_offset, type_num, chunks):
                    yield result

        self._ensure_no_pending()

    def _result(self, unpacked):
        return unpacked

    def _resolve_object(self, offset, obj_type_num, base_chunks):
        self._file.seek(offset)
        unpacked, _ = unpack_object(
            self._file.read, include_comp=self._include_comp,
            compute_crc32=self._compute_crc32)
        unpacked.offset = offset
        if base_chunks is None:
            assert unpacked.pack_type_num == obj_type_num
        else:
            assert unpacked.pack_type_num in DELTA_TYPES
            unpacked.obj_type_num = obj_type_num
            unpacked.obj_chunks = apply_delta(base_chunks,
                                              unpacked.decomp_chunks)
        return unpacked

    def _follow_chain(self, offset, obj_type_num, base_chunks):
        # Unlike PackData.get_object_at, there is no need to cache offsets as
        # this approach by design inflates each object exactly once.
        todo = [(offset, obj_type_num, base_chunks)]
        for offset, obj_type_num, base_chunks in todo:
            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
            yield self._result(unpacked)

            unblocked = chain(self._pending_ofs.pop(unpacked.offset, []),
                              self._pending_ref.pop(unpacked.sha(), []))
            todo.extend(
                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)
                for new_offset in unblocked)

    def __iter__(self):
        return self._walk_all_chains()

    def ext_refs(self):
        return self._ext_refs


class PackIndexer(DeltaChainIterator):
    """Delta chain iterator that yields index entries."""

    _compute_crc32 = True

    def _result(self, unpacked):
        return unpacked.sha(), unpacked.offset, unpacked.crc32


class PackInflater(DeltaChainIterator):
    """Delta chain iterator that yields ShaFile objects."""

    def _result(self, unpacked):
        return unpacked.sha_file()


class SHA1Reader(object):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f):
        self.f = f
        self.sha1 = sha1(b'')

    def read(self, num=None):
        data = self.f.read(num)
        self.sha1.update(data)
        return data

    def check_sha(self):
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        return self.f.close()

    def tell(self):
        return self.f.tell()


class SHA1Writer(object):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f):
        self.f = f
        self.length = 0
        self.sha1 = sha1(b'')

    def write(self, data):
        self.sha1.update(data)
        self.f.write(data)
        self.length += len(data)

    def write_sha(self):
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        self.length += len(sha)
        return sha

    def close(self):
        sha = self.write_sha()
        self.f.close()
        return sha

    def offset(self):
        return self.length

    def tell(self):
        return self.f.tell()


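# Illustrative sketch (not part of the API contract): for a 100-byte blob
# (type_num 3) with no delta base, pack_object_header() below produces a
# two-byte header:
#
#     bytes(pack_object_header(3, None, 100))  # -> b'\xb4\x06'
#     # 0xb4 = continuation bit | type 3 << 4 | low size bits (100 & 0xf == 4)
#     # 0x06 = remaining size bits (100 >> 4 == 6)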
def pack_object_header(type_num, delta_base, size):
    """Create a pack object header for the given object info.

    Args:
      type_num: Numeric type of the object.
      delta_base: Delta base offset or ref, or None for whole objects.
      size: Uncompressed object size.
    Returns: A header for a packed object.
    """
    header = []
    c = (type_num << 4) | (size & 15)
    size >>= 4
    while size:
        header.append(c | 0x80)
        c = size & 0x7f
        size >>= 7
    header.append(c)
    if type_num == OFS_DELTA:
        ret = [delta_base & 0x7f]
        delta_base >>= 7
        while delta_base:
            delta_base -= 1
            ret.insert(0, 0x80 | (delta_base & 0x7f))
            delta_base >>= 7
        header.extend(ret)
    elif type_num == REF_DELTA:
        assert len(delta_base) == 20
        header += delta_base
    return bytearray(header)


def write_pack_object(f, type, object, sha=None, compression_level=-1):
    """Write pack object to a file.

    Args:
      f: File to write to
      type: Numeric type of the object
      object: Object to write
      sha: Optional SHA-1 hash object to update with the written data
      compression_level: the zlib compression level
    Returns: CRC32 checksum of the written data
    """
    if type in DELTA_TYPES:
        delta_base, object = object
    else:
        delta_base = None
    header = bytes(pack_object_header(type, delta_base, len(object)))
    comp_data = zlib.compress(object, compression_level)
    crc32 = 0
    for data in (header, comp_data):
        f.write(data)
        if sha is not None:
            sha.update(data)
        crc32 = binascii.crc32(data, crc32)
    return crc32 & 0xffffffff


def write_pack(filename, objects, deltify=None, delta_window_size=None,
               compression_level=-1):
    """Write a new pack data file.

    Args:
      filename: Path to the new pack file (without .pack extension)
      objects: Iterable of (object, path) tuples to write.
        Should provide __len__
      deltify: Whether to deltify pack objects
      delta_window_size: Delta window size
      compression_level: the zlib compression level
    Returns: Tuple with checksum of pack file and index file
    """
    with GitFile(filename + '.pack', 'wb') as f:
        entries, data_sum = write_pack_objects(
            f, objects, delta_window_size=delta_window_size, deltify=deltify,
            compression_level=compression_level)
    entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
    with GitFile(filename + '.idx', 'wb') as f:
        return data_sum, write_pack_index_v2(f, entries, data_sum)


def write_pack_header(f, num_objects):
    """Write a pack header for the given number of objects."""
    f.write(b'PACK')                          # Pack header
    f.write(struct.pack(b'>L', 2))            # Pack version
    f.write(struct.pack(b'>L', num_objects))  # Number of objects in pack


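# Illustrative sketch (assumes dulwich.objects.Blob, which this module does not
# import): deltify_pack_objects() below yields one
# (type_num, sha, delta_base, content) tuple per object, where delta_base is
# None for objects stored as full text.  Objects are considered largest-first,
# so here b2 is emitted as full text and b1 comes out as a delta against it
# (delta_base == b2.sha().digest()) because the delta is smaller than b1's
# full text.
#
#     from dulwich.objects import Blob
#     b1 = Blob.from_string(b'a' * 64)
#     b2 = Blob.from_string(b'a' * 64 + b'b')
#     entries = list(deltify_pack_objects([(b1, None), (b2, None)]))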
def deltify_pack_objects(objects, window_size=None):
    """Generate deltas for pack objects.

    Args:
      objects: An iterable of (object, path) tuples to deltify.
      window_size: Window size; None for default
    Returns: Iterator over type_num, object id, delta_base, content
        delta_base is None for full text entries
    """
    # TODO(jelmer): Use threads
    if window_size is None:
        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against.
    magic = []
    for obj, path in objects:
        magic.append((obj.type_num, path, -obj.raw_length(), obj))
    magic.sort()

    possible_bases = deque()

    for type_num, path, neg_length, o in magic:
        raw = o.as_raw_string()
        winner = raw
        winner_base = None
        for base in possible_bases:
            if base.type_num != type_num:
                continue
            delta = create_delta(base.as_raw_string(), raw)
            if len(delta) < len(winner):
                winner_base = base.sha().digest()
                winner = delta
        yield type_num, o.sha().digest(), winner_base, winner
        possible_bases.appendleft(o)
        while len(possible_bases) > window_size:
            possible_bases.pop()


def pack_objects_to_data(objects):
    """Create pack data from objects.

    Args:
      objects: Pack objects
    Returns: Tuple with object count and iterator over
        (type_num, sha digest, delta base, object chunks)
    """
    count = len(objects)
    return (count,
            ((o.type_num, o.sha().digest(), None, o.as_raw_string())
             for (o, path) in objects))


def write_pack_objects(f, objects, delta_window_size=None, deltify=None,
                       compression_level=-1):
    """Write a new pack data file.

    Args:
      f: File to write to
      objects: Iterable of (object, path) tuples to write.
        Should provide __len__
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      compression_level: the zlib compression level to use
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if deltify:
        pack_contents = deltify_pack_objects(objects, delta_window_size)
        pack_contents_count = len(objects)
    else:
        pack_contents_count, pack_contents = pack_objects_to_data(objects)

    return write_pack_data(
        f, pack_contents_count, pack_contents,
        compression_level=compression_level)


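# Illustrative sketch (assumes dulwich.objects.Blob, which this module does not
# import): writing a one-object pack to an in-memory buffer with
# write_pack_objects() above.
#
#     import io
#     from dulwich.objects import Blob
#     buf = io.BytesIO()
#     entries, data_sum = write_pack_objects(
#         buf, [(Blob.from_string(b'hello'), None)])
#     # entries maps the blob's binary SHA to (offset, crc32); data_sum is the
#     # pack checksum appended at the end of buf.

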
def write_pack_data(
        f, num_records, records, progress=None, compression_level=-1):
    """Write a new pack data file.

    Args:
      f: File to write to
      num_records: Number of records
      records: Iterator over type_num, object_id, delta_base, raw
      progress: Function to report progress to
      compression_level: the zlib compression level
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Write the pack
    entries = {}
    f = SHA1Writer(f)
    write_pack_header(f, num_records)
    for i, (type_num, object_id, delta_base, raw) in enumerate(records):
        if progress is not None:
            progress((
                'writing pack data: %d/%d\r' %
                (i, num_records)).encode('ascii'))
        offset = f.offset()
        if delta_base is not None:
            try:
                base_offset, base_crc32 = entries[delta_base]
            except KeyError:
                type_num = REF_DELTA
                raw = (delta_base, raw)
            else:
                type_num = OFS_DELTA
                raw = (offset - base_offset, raw)
        crc32 = write_pack_object(
            f, type_num, raw, compression_level=compression_level)
        entries[object_id] = (offset, crc32)
    return entries, f.write_sha()


def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    Args:
      f: A file-like object to write to
      entries: List of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the written index file
    """
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack('>L', fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        if not (offset <= 0xffffffff):
            raise TypeError("pack format 1 only supports offsets < 2Gb")
        f.write(struct.pack('>L20s', offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()


def _delta_encode_size(size):
    ret = bytearray()
    c = size & 0x7f
    size >>= 7
    while size:
        ret.append(c | 0x80)
        c = size & 0x7f
        size >>= 7
    ret.append(c)
    return ret


# The length of delta compression copy operations in version 2 packs is limited
# to 64K. To copy more, we use several copy operations. Version 3 packs allow
# 24-bit lengths in copy operations, but we always make version 2 packs.
_MAX_COPY_LEN = 0xffff


def _encode_copy_operation(start, length):
    scratch = []
    op = 0x80
    for i in range(4):
        if start & 0xff << i*8:
            scratch.append((start >> i*8) & 0xff)
            op |= 1 << i
    for i in range(2):
        if length & 0xff << i*8:
            scratch.append((length >> i*8) & 0xff)
            op |= 1 << (4+i)
    return bytearray([op] + scratch)


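# Illustrative sketch (not part of the library): create_delta() below encodes
# the difference between two buffers, and apply_delta() (further below)
# reverses it.
#
#     base = b'the quick brown fox jumps over the lazy dog'
#     target = b'the quick red fox jumps over the lazy dog'
#     delta = create_delta(base, target)
#     assert b''.join(apply_delta(base, delta)) == target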
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    Args:
      base_buf: Base buffer
      target_buf: Target buffer
    """
    assert isinstance(base_buf, bytes)
    assert isinstance(target_buf, bytes)
    out_buf = bytearray()
    # write delta header
    out_buf += _delta_encode_size(len(base_buf))
    out_buf += _delta_encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        # if opcode == 'replace' or opcode == 'delete':
        #     pass
        if opcode == 'equal':
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            copy_start = i1
            copy_len = i2 - i1
            while copy_len > 0:
                to_copy = min(copy_len, _MAX_COPY_LEN)
                out_buf += _encode_copy_operation(copy_start, to_copy)
                copy_start += to_copy
                copy_len -= to_copy
        if opcode == 'replace' or opcode == 'insert':
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf.append(127)
                out_buf += bytearray(target_buf[o:o+127])
                s -= 127
                o += 127
            out_buf.append(s)
            out_buf += bytearray(target_buf[o:o+s])
    return bytes(out_buf)


def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    Args:
      src_buf: Source buffer
      delta: Delta instructions
    """
    if not isinstance(src_buf, bytes):
        src_buf = b''.join(src_buf)
    if not isinstance(delta, bytes):
        delta = b''.join(delta)
    out = []
    index = 0
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index:index+1])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), '%d vs %d' % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index:index+1])
        index += 1
        if cmd & 0x80:
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index:index+1])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            # Version 3 packs can contain copy sizes larger than 64K.
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x = ord(delta[index:index+1])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                cp_size = 0x10000
            if (cp_off + cp_size < cp_size or
                    cp_off + cp_size > src_size or
                    cp_size > dest_size):
                break
            out.append(src_buf[cp_off:cp_off+cp_size])
        elif cmd != 0:
            out.append(delta[index:index+cmd])
            index += cmd
        else:
            raise ApplyDeltaError('Invalid opcode 0')

    if index != delta_length:
        raise ApplyDeltaError('delta not empty: %r' % delta[index:])

    if dest_size != chunks_length(out):
        raise ApplyDeltaError('dest size incorrect')

    return out


def write_pack_index_v2(f, entries, pack_checksum):
    """Write a new pack index file.

    Args:
      f: File-like object to write to
      entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the index file written
    """
    f = SHA1Writer(f)
    f.write(b'\377tOc')  # Magic!
def write_pack_index_v2(f, entries, pack_checksum):
    """Write a new pack index file.

    Args:
      f: File-like object to write to
      entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the index file written
    """
    f = SHA1Writer(f)
    f.write(b'\377tOc')  # Magic number for a version 2 pack index
    f.write(struct.pack('>L', 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[:1])] += 1
    largetable = []
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(b'>L', fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(b'>L', entry_checksum))
    for (name, offset, entry_checksum) in entries:
        if offset < 2**31:
            f.write(struct.pack(b'>L', offset))
        else:
            f.write(struct.pack(b'>L', 2**31 + len(largetable)))
            largetable.append(offset)
    for offset in largetable:
        f.write(struct.pack(b'>Q', offset))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()


write_pack_index = write_pack_index_v2


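# Illustrative use of the index writers above (a sketch; ``sorted_entries``
# and the 'pack-1234.idx' path are hypothetical).  Entries are
# (binary sha, offset, crc32) tuples sorted by sha, and ``pack_checksum`` is
# the checksum returned when the corresponding pack data was written:
#
#     with GitFile('pack-1234.idx', 'wb') as idx_f:
#         write_pack_index(idx_f, sorted_entries, pack_checksum)

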
class Pack(object):
    """A Git pack object."""

    def __init__(self, basename, resolve_ext_ref=None):
        self._basename = basename
        self._data = None
        self._idx = None
        self._idx_path = self._basename + '.idx'
        self._data_path = self._basename + '.pack'
        self._data_load = lambda: PackData(self._data_path)
        self._idx_load = lambda: load_pack_index(self._idx_path)
        self.resolve_ext_ref = resolve_ext_ref

    @classmethod
    def from_lazy_objects(cls, data_fn, idx_fn):
        """Create a new pack object from callables to load pack data and
        index objects."""
        ret = Pack('')
        ret._data_load = data_fn
        ret._idx_load = idx_fn
        return ret

    @classmethod
    def from_objects(cls, data, idx):
        """Create a new pack object from pack data and index objects."""
        ret = Pack('')
        ret._data_load = lambda: data
        ret._idx_load = lambda: idx
        return ret

    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()

    @property
    def data(self):
        """The pack data object being used."""
        if self._data is None:
            self._data = self._data_load()
            self._data.pack = self
            self.check_length_and_checksum()
        return self._data

    @property
    def index(self):
        """The index being used.

        Note: This may be an in-memory index
        """
        if self._idx is None:
            self._idx = self._idx_load()
        return self._idx

    def close(self):
        if self._data is not None:
            self._data.close()
        if self._idx is not None:
            self._idx.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __eq__(self, other):
        return isinstance(self, type(other)) and self.index == other.index

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._basename)

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check_length_and_checksum(self):
        """Sanity check the length and checksum of the pack index and data."""
        assert len(self.index) == len(self.data)
        idx_stored_checksum = self.index.get_pack_checksum()
        data_stored_checksum = self.data.get_stored_checksum()
        if idx_stored_checksum != data_stored_checksum:
            raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
                                   sha_to_hex(data_stored_checksum))

    def check(self):
        """Check the integrity of this pack.

        Raises:
          ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks

    def get_stored_checksum(self):
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_index(sha1)
            return True
        except KeyError:
            return False

    def get_raw_unresolved(self, sha1):
        """Get raw unresolved data for a SHA.

        Args:
          sha1: SHA to return data for
        Returns: Tuple with pack object type, delta base (if applicable),
            list of data chunks
        """
        offset = self.index.object_index(sha1)
        (obj_type, delta_base, chunks) = self.data.get_compressed_data_at(
            offset)
        if obj_type == OFS_DELTA:
            delta_base = sha_to_hex(
                self.index.object_sha1(offset - delta_base))
            obj_type = REF_DELTA
        return (obj_type, delta_base, chunks)

    def get_raw(self, sha1):
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
        return type_num, b''.join(chunks)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp, sha=sha1)

    def iterobjects(self):
        """Iterate over the objects in this pack."""
        return iter(PackInflater.for_pack_data(
            self.data, resolve_ext_ref=self.resolve_ext_ref))

    def pack_tuples(self):
        """Provide an iterable for use with write_pack_objects.

        Returns: Object that can iterate over (object, path) tuples
            and provides __len__
        """
        class PackTupleIterable(object):

            def __init__(self, pack):
                self.pack = pack

            def __len__(self):
                return len(self.pack)

            def __iter__(self):
                return ((o, None) for o in self.pack.iterobjects())

        return PackTupleIterable(self)

    def keep(self, msg=None):
        """Add a .keep file for the pack, preventing git from garbage collecting it.

        Args:
          msg: A message written inside the .keep file; can be used later
            to determine whether or not a .keep file is obsolete.
        Returns: The path of the .keep file, as a string.
        """
        keepfile_name = '%s.keep' % self._basename
        with GitFile(keepfile_name, 'wb') as keepfile:
            if msg:
                keepfile.write(msg)
                keepfile.write(b'\n')
        return keepfile_name


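# Typical read-only use of a Pack on disk (a sketch; 'pack-1234' is a
# hypothetical basename, so pack-1234.pack and pack-1234.idx must both exist
# next to each other):
#
#     with Pack('pack-1234') as pack:
#         pack.check_length_and_checksum()
#         for hex_sha in pack:        # iterates over the index
#             obj = pack[hex_sha]     # inflates and resolves deltas

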
try:
    from dulwich._pack import apply_delta, bisect_find_sha  # noqa: F811
except ImportError:
    pass
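

# Re-packing sketch: Pack.pack_tuples() above yields (object, path) pairs and
# provides __len__, which is the shape expected by write_pack_objects()
# defined earlier in this module (``out_f`` is a hypothetical writable file
# object, 'pack-1234' a hypothetical basename):
#
#     with Pack('pack-1234') as pack:
#         write_pack_objects(out_f, pack.pack_tuples())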