1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6# General Public License as public by the Free Software Foundation; version 2.0
7# or (at your option) any later version. You can redistribute it and/or
8# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
21
22"""Classes for dealing with packed git objects.
23
24A pack is a compact representation of a bunch of objects, stored
25using deltas where possible.
26
27They have two parts, the pack file, which stores the data, and an index
28that tells you where the data is.
29
30To find an object you look in all of the index files 'til you find a
31match for the object name. You then use the pointer got from this as
32a pointer in to the corresponding packfile.
33"""
34
35from collections import defaultdict
36
37import binascii
38from io import BytesIO, UnsupportedOperation
39from collections import (
40    deque,
41    )
42import difflib
43import struct
44
45from itertools import chain
46try:
47    from itertools import imap, izip
48except ImportError:
49    # Python3
50    imap = map
51    izip = zip
52
53import os
54import sys
55
56from hashlib import sha1
57from os import (
58    SEEK_CUR,
59    SEEK_END,
60    )
61from struct import unpack_from
62import zlib
63
64try:
65    import mmap
66except ImportError:
67    has_mmap = False
68else:
69    has_mmap = True
70
71# For some reason the above try, except fails to set has_mmap = False for plan9
72if sys.platform == 'Plan9':
73    has_mmap = False
74
75from dulwich.errors import (  # noqa: E402
76    ApplyDeltaError,
77    ChecksumMismatch,
78    )
79from dulwich.file import GitFile  # noqa: E402
80from dulwich.lru_cache import (  # noqa: E402
81    LRUSizeCache,
82    )
83from dulwich.objects import (  # noqa: E402
84    ShaFile,
85    hex_to_sha,
86    sha_to_hex,
87    object_header,
88    )
89
90
91OFS_DELTA = 6
92REF_DELTA = 7
93
94DELTA_TYPES = (OFS_DELTA, REF_DELTA)
95
96
97DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
98
99
100def take_msb_bytes(read, crc32=None):
101    """Read bytes marked with most significant bit.
102
103    Args:
104      read: Read function
105    """
106    ret = []
107    while len(ret) == 0 or ret[-1] & 0x80:
108        b = read(1)
109        if crc32 is not None:
110            crc32 = binascii.crc32(b, crc32)
111        ret.append(ord(b[:1]))
112    return ret, crc32
113
114
115class PackFileDisappeared(Exception):
116
117    def __init__(self, obj):
118        self.obj = obj
119
120
121class UnpackedObject(object):
122    """Class encapsulating an object unpacked from a pack file.
123
124    These objects should only be created from within unpack_object. Most
125    members start out as empty and are filled in at various points by
126    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
127
128    End users of this object should take care that the function they're getting
129    this object from is guaranteed to set the members they need.
130    """
131
132    __slots__ = [
133      'offset',         # Offset in its pack.
134      '_sha',           # Cached binary SHA.
135      'obj_type_num',   # Type of this object.
136      'obj_chunks',     # Decompressed and delta-resolved chunks.
137      'pack_type_num',  # Type of this object in the pack (may be a delta).
138      'delta_base',     # Delta base offset or SHA.
139      'comp_chunks',    # Compressed object chunks.
140      'decomp_chunks',  # Decompressed object chunks.
141      'decomp_len',     # Decompressed length of this object.
142      'crc32',          # CRC32.
143      ]
144
145    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
146    # methods of this object.
147    def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
148        self.offset = None
149        self._sha = None
150        self.pack_type_num = pack_type_num
151        self.delta_base = delta_base
152        self.comp_chunks = None
153        self.decomp_chunks = []
154        self.decomp_len = decomp_len
155        self.crc32 = crc32
156
157        if pack_type_num in DELTA_TYPES:
158            self.obj_type_num = None
159            self.obj_chunks = None
160        else:
161            self.obj_type_num = pack_type_num
162            self.obj_chunks = self.decomp_chunks
163            self.delta_base = delta_base
164
165    def sha(self):
166        """Return the binary SHA of this object."""
167        if self._sha is None:
168            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
169        return self._sha
170
171    def sha_file(self):
172        """Return a ShaFile from this object."""
173        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
174
175    # Only provided for backwards compatibility with code that expects either
176    # chunks or a delta tuple.
177    def _obj(self):
178        """Return the decompressed chunks, or (delta base, delta chunks)."""
179        if self.pack_type_num in DELTA_TYPES:
180            return (self.delta_base, self.decomp_chunks)
181        else:
182            return self.decomp_chunks
183
184    def __eq__(self, other):
185        if not isinstance(other, UnpackedObject):
186            return False
187        for slot in self.__slots__:
188            if getattr(self, slot) != getattr(other, slot):
189                return False
190        return True
191
192    def __ne__(self, other):
193        return not (self == other)
194
195    def __repr__(self):
196        data = ['%s=%r' % (s, getattr(self, s)) for s in self.__slots__]
197        return '%s(%s)' % (self.__class__.__name__, ', '.join(data))
198
199
200_ZLIB_BUFSIZE = 4096
201
202
203def read_zlib_chunks(read_some, unpacked, include_comp=False,
204                     buffer_size=_ZLIB_BUFSIZE):
205    """Read zlib data from a buffer.
206
207    This function requires that the buffer have additional data following the
208    compressed data, which is guaranteed to be the case for git pack files.
209
210    Args:
211      read_some: Read function that returns at least one byte, but may
212        return less than the requested size.
213      unpacked: An UnpackedObject to write result data to. If its crc32
214        attr is not None, the CRC32 of the compressed bytes will be computed
215        using this starting CRC32.
216        After this function, will have the following attrs set:
217        * comp_chunks    (if include_comp is True)
218        * decomp_chunks
219        * decomp_len
220        * crc32
221      include_comp: If True, include compressed data in the result.
222      buffer_size: Size of the read buffer.
223    Returns: Leftover unused data from the decompression.
224    Raises:
225      zlib.error: if a decompression error occurred.
226    """
227    if unpacked.decomp_len <= -1:
228        raise ValueError('non-negative zlib data stream size expected')
229    decomp_obj = zlib.decompressobj()
230
231    comp_chunks = []
232    decomp_chunks = unpacked.decomp_chunks
233    decomp_len = 0
234    crc32 = unpacked.crc32
235
236    while True:
237        add = read_some(buffer_size)
238        if not add:
239            raise zlib.error('EOF before end of zlib stream')
240        comp_chunks.append(add)
241        decomp = decomp_obj.decompress(add)
242        decomp_len += len(decomp)
243        decomp_chunks.append(decomp)
244        unused = decomp_obj.unused_data
245        if unused:
246            left = len(unused)
247            if crc32 is not None:
248                crc32 = binascii.crc32(add[:-left], crc32)
249            if include_comp:
250                comp_chunks[-1] = add[:-left]
251            break
252        elif crc32 is not None:
253            crc32 = binascii.crc32(add, crc32)
254    if crc32 is not None:
255        crc32 &= 0xffffffff
256
257    if decomp_len != unpacked.decomp_len:
258        raise zlib.error('decompressed data does not match expected size')
259
260    unpacked.crc32 = crc32
261    if include_comp:
262        unpacked.comp_chunks = comp_chunks
263    return unused
264
265
266def iter_sha1(iter):
267    """Return the hexdigest of the SHA1 over a set of names.
268
269    Args:
270      iter: Iterator over string objects
271    Returns: 40-byte hex sha1 digest
272    """
273    sha = sha1()
274    for name in iter:
275        sha.update(name)
276    return sha.hexdigest().encode('ascii')
277
278
279def load_pack_index(path):
280    """Load an index file by path.
281
282    Args:
283      filename: Path to the index file
284    Returns: A PackIndex loaded from the given path
285    """
286    with GitFile(path, 'rb') as f:
287        return load_pack_index_file(path, f)
288
289
290def _load_file_contents(f, size=None):
291    try:
292        fd = f.fileno()
293    except (UnsupportedOperation, AttributeError):
294        fd = None
295    # Attempt to use mmap if possible
296    if fd is not None:
297        if size is None:
298            size = os.fstat(fd).st_size
299        if has_mmap:
300            try:
301                contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
302            except mmap.error:
303                # Perhaps a socket?
304                pass
305            else:
306                return contents, size
307    contents = f.read()
308    size = len(contents)
309    return contents, size
310
311
312def load_pack_index_file(path, f):
313    """Load an index file from a file-like object.
314
315    Args:
316      path: Path for the index file
317      f: File-like object
318    Returns: A PackIndex loaded from the given file
319    """
320    contents, size = _load_file_contents(f)
321    if contents[:4] == b'\377tOc':
322        version = struct.unpack(b'>L', contents[4:8])[0]
323        if version == 2:
324            return PackIndex2(
325                path, file=f, contents=contents, size=size)
326        else:
327            raise KeyError('Unknown pack index format %d' % version)
328    else:
329        return PackIndex1(path, file=f, contents=contents, size=size)
330
331
332def bisect_find_sha(start, end, sha, unpack_name):
333    """Find a SHA in a data blob with sorted SHAs.
334
335    Args:
336      start: Start index of range to search
337      end: End index of range to search
338      sha: Sha to find
339      unpack_name: Callback to retrieve SHA by index
340    Returns: Index of the SHA, or None if it wasn't found
341    """
342    assert start <= end
343    while start <= end:
344        i = (start + end) // 2
345        file_sha = unpack_name(i)
346        if file_sha < sha:
347            start = i + 1
348        elif file_sha > sha:
349            end = i - 1
350        else:
351            return i
352    return None
353
354
355class PackIndex(object):
356    """An index in to a packfile.
357
358    Given a sha id of an object a pack index can tell you the location in the
359    packfile of that object if it has it.
360    """
361
362    def __eq__(self, other):
363        if not isinstance(other, PackIndex):
364            return False
365
366        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
367                                                 other.iterentries()):
368            if name1 != name2:
369                return False
370        return True
371
372    def __ne__(self, other):
373        return not self.__eq__(other)
374
375    def __len__(self):
376        """Return the number of entries in this pack index."""
377        raise NotImplementedError(self.__len__)
378
379    def __iter__(self):
380        """Iterate over the SHAs in this pack."""
381        return imap(sha_to_hex, self._itersha())
382
383    def iterentries(self):
384        """Iterate over the entries in this pack index.
385
386        Returns: iterator over tuples with object name, offset in packfile and
387            crc32 checksum.
388        """
389        raise NotImplementedError(self.iterentries)
390
391    def get_pack_checksum(self):
392        """Return the SHA1 checksum stored for the corresponding packfile.
393
394        Returns: 20-byte binary digest
395        """
396        raise NotImplementedError(self.get_pack_checksum)
397
398    def object_index(self, sha):
399        """Return the index in to the corresponding packfile for the object.
400
401        Given the name of an object it will return the offset that object
402        lives at within the corresponding pack file. If the pack file doesn't
403        have the object then None will be returned.
404        """
405        if len(sha) == 40:
406            sha = hex_to_sha(sha)
407        try:
408            return self._object_index(sha)
409        except ValueError:
410            closed = getattr(self._contents, 'closed', None)
411            if closed in (None, True):
412                raise PackFileDisappeared(self)
413            raise
414
415    def object_sha1(self, index):
416        """Return the SHA1 corresponding to the index in the pack file.
417        """
418        # PERFORMANCE/TODO(jelmer): Avoid scanning entire index
419        for (name, offset, crc32) in self.iterentries():
420            if offset == index:
421                return name
422        else:
423            raise KeyError(index)
424
425    def _object_index(self, sha):
426        """See object_index.
427
428        Args:
429          sha: A *binary* SHA string. (20 characters long)_
430        """
431        raise NotImplementedError(self._object_index)
432
433    def objects_sha1(self):
434        """Return the hex SHA1 over all the shas of all objects in this pack.
435
436        Note: This is used for the filename of the pack.
437        """
438        return iter_sha1(self._itersha())
439
440    def _itersha(self):
441        """Yield all the SHA1's of the objects in the index, sorted."""
442        raise NotImplementedError(self._itersha)
443
444
445class MemoryPackIndex(PackIndex):
446    """Pack index that is stored entirely in memory."""
447
448    def __init__(self, entries, pack_checksum=None):
449        """Create a new MemoryPackIndex.
450
451        Args:
452          entries: Sequence of name, idx, crc32 (sorted)
453          pack_checksum: Optional pack checksum
454        """
455        self._by_sha = {}
456        self._by_index = {}
457        for name, idx, crc32 in entries:
458            self._by_sha[name] = idx
459            self._by_index[idx] = name
460        self._entries = entries
461        self._pack_checksum = pack_checksum
462
463    def get_pack_checksum(self):
464        return self._pack_checksum
465
466    def __len__(self):
467        return len(self._entries)
468
469    def _object_index(self, sha):
470        return self._by_sha[sha][0]
471
472    def object_sha1(self, index):
473        return self._by_index[index]
474
475    def _itersha(self):
476        return iter(self._by_sha)
477
478    def iterentries(self):
479        return iter(self._entries)
480
481
482class FilePackIndex(PackIndex):
483    """Pack index that is based on a file.
484
485    To do the loop it opens the file, and indexes first 256 4 byte groups
486    with the first byte of the sha id. The value in the four byte group indexed
487    is the end of the group that shares the same starting byte. Subtract one
488    from the starting byte and index again to find the start of the group.
489    The values are sorted by sha id within the group, so do the math to find
490    the start and end offset and then bisect in to find if the value is
491    present.
492    """
493
494    def __init__(self, filename, file=None, contents=None, size=None):
495        """Create a pack index object.
496
497        Provide it with the name of the index file to consider, and it will map
498        it whenever required.
499        """
500        self._filename = filename
501        # Take the size now, so it can be checked each time we map the file to
502        # ensure that it hasn't changed.
503        if file is None:
504            self._file = GitFile(filename, 'rb')
505        else:
506            self._file = file
507        if contents is None:
508            self._contents, self._size = _load_file_contents(self._file, size)
509        else:
510            self._contents, self._size = (contents, size)
511
512    @property
513    def path(self):
514        return self._filename
515
516    def __eq__(self, other):
517        # Quick optimization:
518        if (isinstance(other, FilePackIndex) and
519                self._fan_out_table != other._fan_out_table):
520            return False
521
522        return super(FilePackIndex, self).__eq__(other)
523
524    def close(self):
525        self._file.close()
526        if getattr(self._contents, "close", None) is not None:
527            self._contents.close()
528
529    def __len__(self):
530        """Return the number of entries in this pack index."""
531        return self._fan_out_table[-1]
532
533    def _unpack_entry(self, i):
534        """Unpack the i-th entry in the index file.
535
536        Returns: Tuple with object name (SHA), offset in pack file and CRC32
537            checksum (if known).
538        """
539        raise NotImplementedError(self._unpack_entry)
540
541    def _unpack_name(self, i):
542        """Unpack the i-th name from the index file."""
543        raise NotImplementedError(self._unpack_name)
544
545    def _unpack_offset(self, i):
546        """Unpack the i-th object offset from the index file."""
547        raise NotImplementedError(self._unpack_offset)
548
549    def _unpack_crc32_checksum(self, i):
550        """Unpack the crc32 checksum for the ith object from the index file.
551        """
552        raise NotImplementedError(self._unpack_crc32_checksum)
553
554    def _itersha(self):
555        for i in range(len(self)):
556            yield self._unpack_name(i)
557
558    def iterentries(self):
559        """Iterate over the entries in this pack index.
560
561        Returns: iterator over tuples with object name, offset in packfile and
562            crc32 checksum.
563        """
564        for i in range(len(self)):
565            yield self._unpack_entry(i)
566
567    def _read_fan_out_table(self, start_offset):
568        ret = []
569        for i in range(0x100):
570            fanout_entry = self._contents[
571                start_offset+i*4:start_offset+(i+1)*4]
572            ret.append(struct.unpack('>L', fanout_entry)[0])
573        return ret
574
575    def check(self):
576        """Check that the stored checksum matches the actual checksum."""
577        actual = self.calculate_checksum()
578        stored = self.get_stored_checksum()
579        if actual != stored:
580            raise ChecksumMismatch(stored, actual)
581
582    def calculate_checksum(self):
583        """Calculate the SHA1 checksum over this pack index.
584
585        Returns: This is a 20-byte binary digest
586        """
587        return sha1(self._contents[:-20]).digest()
588
589    def get_pack_checksum(self):
590        """Return the SHA1 checksum stored for the corresponding packfile.
591
592        Returns: 20-byte binary digest
593        """
594        return bytes(self._contents[-40:-20])
595
596    def get_stored_checksum(self):
597        """Return the SHA1 checksum stored for this index.
598
599        Returns: 20-byte binary digest
600        """
601        return bytes(self._contents[-20:])
602
603    def _object_index(self, sha):
604        """See object_index.
605
606        Args:
607          sha: A *binary* SHA string. (20 characters long)_
608        """
609        assert len(sha) == 20
610        idx = ord(sha[:1])
611        if idx == 0:
612            start = 0
613        else:
614            start = self._fan_out_table[idx-1]
615        end = self._fan_out_table[idx]
616        i = bisect_find_sha(start, end, sha, self._unpack_name)
617        if i is None:
618            raise KeyError(sha)
619        return self._unpack_offset(i)
620
621
622class PackIndex1(FilePackIndex):
623    """Version 1 Pack Index file."""
624
625    def __init__(self, filename, file=None, contents=None, size=None):
626        super(PackIndex1, self).__init__(filename, file, contents, size)
627        self.version = 1
628        self._fan_out_table = self._read_fan_out_table(0)
629
630    def _unpack_entry(self, i):
631        (offset, name) = unpack_from('>L20s', self._contents,
632                                     (0x100 * 4) + (i * 24))
633        return (name, offset, None)
634
635    def _unpack_name(self, i):
636        offset = (0x100 * 4) + (i * 24) + 4
637        return self._contents[offset:offset+20]
638
639    def _unpack_offset(self, i):
640        offset = (0x100 * 4) + (i * 24)
641        return unpack_from('>L', self._contents, offset)[0]
642
643    def _unpack_crc32_checksum(self, i):
644        # Not stored in v1 index files
645        return None
646
647
648class PackIndex2(FilePackIndex):
649    """Version 2 Pack Index file."""
650
651    def __init__(self, filename, file=None, contents=None, size=None):
652        super(PackIndex2, self).__init__(filename, file, contents, size)
653        if self._contents[:4] != b'\377tOc':
654            raise AssertionError('Not a v2 pack index file')
655        (self.version, ) = unpack_from(b'>L', self._contents, 4)
656        if self.version != 2:
657            raise AssertionError('Version was %d' % self.version)
658        self._fan_out_table = self._read_fan_out_table(8)
659        self._name_table_offset = 8 + 0x100 * 4
660        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
661        self._pack_offset_table_offset = (self._crc32_table_offset +
662                                          4 * len(self))
663        self._pack_offset_largetable_offset = (
664            self._pack_offset_table_offset + 4 * len(self))
665
666    def _unpack_entry(self, i):
667        return (self._unpack_name(i), self._unpack_offset(i),
668                self._unpack_crc32_checksum(i))
669
670    def _unpack_name(self, i):
671        offset = self._name_table_offset + i * 20
672        return self._contents[offset:offset+20]
673
674    def _unpack_offset(self, i):
675        offset = self._pack_offset_table_offset + i * 4
676        offset = unpack_from('>L', self._contents, offset)[0]
677        if offset & (2**31):
678            offset = (
679                self._pack_offset_largetable_offset +
680                (offset & (2 ** 31 - 1)) * 8)
681            offset = unpack_from('>Q', self._contents, offset)[0]
682        return offset
683
684    def _unpack_crc32_checksum(self, i):
685        return unpack_from('>L', self._contents,
686                           self._crc32_table_offset + i * 4)[0]
687
688
689def read_pack_header(read):
690    """Read the header of a pack file.
691
692    Args:
693      read: Read function
694    Returns: Tuple of (pack version, number of objects). If no data is
695        available to read, returns (None, None).
696    """
697    header = read(12)
698    if not header:
699        return None, None
700    if header[:4] != b'PACK':
701        raise AssertionError('Invalid pack header %r' % header)
702    (version,) = unpack_from(b'>L', header, 4)
703    if version not in (2, 3):
704        raise AssertionError('Version was %d' % version)
705    (num_objects,) = unpack_from(b'>L', header, 8)
706    return (version, num_objects)
707
708
709def chunks_length(chunks):
710    if isinstance(chunks, bytes):
711        return len(chunks)
712    else:
713        return sum(imap(len, chunks))
714
715
716def unpack_object(read_all, read_some=None, compute_crc32=False,
717                  include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE):
718    """Unpack a Git object.
719
720    Args:
721      read_all: Read function that blocks until the number of requested
722        bytes are read.
723      read_some: Read function that returns at least one byte, but may not
724        return the number of bytes requested.
725      compute_crc32: If True, compute the CRC32 of the compressed data. If
726        False, the returned CRC32 will be None.
727      include_comp: If True, include compressed data in the result.
728      zlib_bufsize: An optional buffer size for zlib operations.
729    Returns: A tuple of (unpacked, unused), where unused is the unused data
730        leftover from decompression, and unpacked in an UnpackedObject with
731        the following attrs set:
732
733        * obj_chunks     (for non-delta types)
734        * pack_type_num
735        * delta_base     (for delta types)
736        * comp_chunks    (if include_comp is True)
737        * decomp_chunks
738        * decomp_len
739        * crc32          (if compute_crc32 is True)
740    """
741    if read_some is None:
742        read_some = read_all
743    if compute_crc32:
744        crc32 = 0
745    else:
746        crc32 = None
747
748    bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
749    type_num = (bytes[0] >> 4) & 0x07
750    size = bytes[0] & 0x0f
751    for i, byte in enumerate(bytes[1:]):
752        size += (byte & 0x7f) << ((i * 7) + 4)
753
754    raw_base = len(bytes)
755    if type_num == OFS_DELTA:
756        bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
757        raw_base += len(bytes)
758        if bytes[-1] & 0x80:
759            raise AssertionError
760        delta_base_offset = bytes[0] & 0x7f
761        for byte in bytes[1:]:
762            delta_base_offset += 1
763            delta_base_offset <<= 7
764            delta_base_offset += (byte & 0x7f)
765        delta_base = delta_base_offset
766    elif type_num == REF_DELTA:
767        delta_base = read_all(20)
768        if compute_crc32:
769            crc32 = binascii.crc32(delta_base, crc32)
770        raw_base += 20
771    else:
772        delta_base = None
773
774    unpacked = UnpackedObject(type_num, delta_base, size, crc32)
775    unused = read_zlib_chunks(read_some, unpacked, buffer_size=zlib_bufsize,
776                              include_comp=include_comp)
777    return unpacked, unused
778
779
780def _compute_object_size(value):
781    """Compute the size of a unresolved object for use with LRUSizeCache."""
782    (num, obj) = value
783    if num in DELTA_TYPES:
784        return chunks_length(obj[1])
785    return chunks_length(obj)
786
787
788class PackStreamReader(object):
789    """Class to read a pack stream.
790
791    The pack is read from a ReceivableProtocol using read() or recv() as
792    appropriate.
793    """
794
795    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE):
796        self.read_all = read_all
797        if read_some is None:
798            self.read_some = read_all
799        else:
800            self.read_some = read_some
801        self.sha = sha1()
802        self._offset = 0
803        self._rbuf = BytesIO()
804        # trailer is a deque to avoid memory allocation on small reads
805        self._trailer = deque()
806        self._zlib_bufsize = zlib_bufsize
807
808    def _read(self, read, size):
809        """Read up to size bytes using the given callback.
810
811        As a side effect, update the verifier's hash (excluding the last 20
812        bytes read).
813
814        Args:
815          read: The read callback to read from.
816          size: The maximum number of bytes to read; the particular
817            behavior is callback-specific.
818        """
819        data = read(size)
820
821        # maintain a trailer of the last 20 bytes we've read
822        n = len(data)
823        self._offset += n
824        tn = len(self._trailer)
825        if n >= 20:
826            to_pop = tn
827            to_add = 20
828        else:
829            to_pop = max(n + tn - 20, 0)
830            to_add = n
831        self.sha.update(
832            bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])))
833        self._trailer.extend(data[-to_add:])
834
835        # hash everything but the trailer
836        self.sha.update(data[:-to_add])
837        return data
838
839    def _buf_len(self):
840        buf = self._rbuf
841        start = buf.tell()
842        buf.seek(0, SEEK_END)
843        end = buf.tell()
844        buf.seek(start)
845        return end - start
846
847    @property
848    def offset(self):
849        return self._offset - self._buf_len()
850
851    def read(self, size):
852        """Read, blocking until size bytes are read."""
853        buf_len = self._buf_len()
854        if buf_len >= size:
855            return self._rbuf.read(size)
856        buf_data = self._rbuf.read()
857        self._rbuf = BytesIO()
858        return buf_data + self._read(self.read_all, size - buf_len)
859
860    def recv(self, size):
861        """Read up to size bytes, blocking until one byte is read."""
862        buf_len = self._buf_len()
863        if buf_len:
864            data = self._rbuf.read(size)
865            if size >= buf_len:
866                self._rbuf = BytesIO()
867            return data
868        return self._read(self.read_some, size)
869
870    def __len__(self):
871        return self._num_objects
872
873    def read_objects(self, compute_crc32=False):
874        """Read the objects in this pack file.
875
876        Args:
877          compute_crc32: If True, compute the CRC32 of the compressed
878            data. If False, the returned CRC32 will be None.
879        Returns: Iterator over UnpackedObjects with the following members set:
880            offset
881            obj_type_num
882            obj_chunks (for non-delta types)
883            delta_base (for delta types)
884            decomp_chunks
885            decomp_len
886            crc32 (if compute_crc32 is True)
887        Raises:
888          ChecksumMismatch: if the checksum of the pack contents does not
889            match the checksum in the pack trailer.
890          zlib.error: if an error occurred during zlib decompression.
891          IOError: if an error occurred writing to the output file.
892        """
893        pack_version, self._num_objects = read_pack_header(self.read)
894        if pack_version is None:
895            return
896
897        for i in range(self._num_objects):
898            offset = self.offset
899            unpacked, unused = unpack_object(
900              self.read, read_some=self.recv, compute_crc32=compute_crc32,
901              zlib_bufsize=self._zlib_bufsize)
902            unpacked.offset = offset
903
904            # prepend any unused data to current read buffer
905            buf = BytesIO()
906            buf.write(unused)
907            buf.write(self._rbuf.read())
908            buf.seek(0)
909            self._rbuf = buf
910
911            yield unpacked
912
913        if self._buf_len() < 20:
914            # If the read buffer is full, then the last read() got the whole
915            # trailer off the wire. If not, it means there is still some of the
916            # trailer to read. We need to read() all 20 bytes; N come from the
917            # read buffer and (20 - N) come from the wire.
918            self.read(20)
919
920        pack_sha = bytearray(self._trailer)
921        if pack_sha != self.sha.digest():
922            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
923
924
925class PackStreamCopier(PackStreamReader):
926    """Class to verify a pack stream as it is being read.
927
928    The pack is read from a ReceivableProtocol using read() or recv() as
929    appropriate and written out to the given file-like object.
930    """
931
932    def __init__(self, read_all, read_some, outfile, delta_iter=None):
933        """Initialize the copier.
934
935        Args:
936          read_all: Read function that blocks until the number of
937            requested bytes are read.
938          read_some: Read function that returns at least one byte, but may
939            not return the number of bytes requested.
940          outfile: File-like object to write output through.
941          delta_iter: Optional DeltaChainIterator to record deltas as we
942            read them.
943        """
944        super(PackStreamCopier, self).__init__(read_all, read_some=read_some)
945        self.outfile = outfile
946        self._delta_iter = delta_iter
947
948    def _read(self, read, size):
949        """Read data from the read callback and write it to the file."""
950        data = super(PackStreamCopier, self)._read(read, size)
951        self.outfile.write(data)
952        return data
953
954    def verify(self):
955        """Verify a pack stream and write it to the output file.
956
957        See PackStreamReader.iterobjects for a list of exceptions this may
958        throw.
959        """
960        if self._delta_iter:
961            for unpacked in self.read_objects():
962                self._delta_iter.record(unpacked)
963        else:
964            for _ in self.read_objects():
965                pass
966
967
968def obj_sha(type, chunks):
969    """Compute the SHA for a numeric type and object chunks."""
970    sha = sha1()
971    sha.update(object_header(type, chunks_length(chunks)))
972    if isinstance(chunks, bytes):
973        sha.update(chunks)
974    else:
975        for chunk in chunks:
976            sha.update(chunk)
977    return sha.digest()
978
979
980def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
981    """Hash a portion of a file into a new SHA.
982
983    Args:
984      f: A file-like object to read from that supports seek().
985      start_ofs: The offset in the file to start reading at.
986      end_ofs: The offset in the file to end reading at, relative to the
987        end of the file.
988      buffer_size: A buffer size for reading.
989    Returns: A new SHA object updated with data read from the file.
990    """
991    sha = sha1()
992    f.seek(0, SEEK_END)
993    length = f.tell()
994    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
995        raise AssertionError(
996            "Attempt to read beyond file length. "
997            "start_ofs: %d, end_ofs: %d, file length: %d" % (
998                start_ofs, end_ofs, length))
999    todo = length + end_ofs - start_ofs
1000    f.seek(start_ofs)
1001    while todo:
1002        data = f.read(min(todo, buffer_size))
1003        sha.update(data)
1004        todo -= len(data)
1005    return sha
1006
1007
1008class PackData(object):
1009    """The data contained in a packfile.
1010
1011    Pack files can be accessed both sequentially for exploding a pack, and
1012    directly with the help of an index to retrieve a specific object.
1013
1014    The objects within are either complete or a delta against another.
1015
1016    The header is variable length. If the MSB of each byte is set then it
1017    indicates that the subsequent byte is still part of the header.
1018    For the first byte the next MS bits are the type, which tells you the type
1019    of object, and whether it is a delta. The LS byte is the lowest bits of the
1020    size. For each subsequent byte the LS 7 bits are the next MS bits of the
1021    size, i.e. the last byte of the header contains the MS bits of the size.
1022
1023    For the complete objects the data is stored as zlib deflated data.
1024    The size in the header is the uncompressed object size, so to uncompress
1025    you need to just keep feeding data to zlib until you get an object back,
1026    or it errors on bad data. This is done here by just giving the complete
1027    buffer from the start of the deflated object on. This is bad, but until I
1028    get mmap sorted out it will have to do.
1029
1030    Currently there are no integrity checks done. Also no attempt is made to
1031    try and detect the delta case, or a request for an object at the wrong
1032    position.  It will all just throw a zlib or KeyError.
1033    """
1034
1035    def __init__(self, filename, file=None, size=None):
1036        """Create a PackData object representing the pack in the given filename.
1037
1038        The file must exist and stay readable until the object is disposed of.
1039        It must also stay the same size. It will be mapped whenever needed.
1040
1041        Currently there is a restriction on the size of the pack as the python
1042        mmap implementation is flawed.
1043        """
1044        self._filename = filename
1045        self._size = size
1046        self._header_size = 12
1047        if file is None:
1048            self._file = GitFile(self._filename, 'rb')
1049        else:
1050            self._file = file
1051        (version, self._num_objects) = read_pack_header(self._file.read)
1052        self._offset_cache = LRUSizeCache(
1053            1024*1024*20, compute_size=_compute_object_size)
1054        self.pack = None
1055
1056    @property
1057    def filename(self):
1058        return os.path.basename(self._filename)
1059
1060    @property
1061    def path(self):
1062        return self._filename
1063
1064    @classmethod
1065    def from_file(cls, file, size):
1066        return cls(str(file), file=file, size=size)
1067
1068    @classmethod
1069    def from_path(cls, path):
1070        return cls(filename=path)
1071
1072    def close(self):
1073        self._file.close()
1074
1075    def __enter__(self):
1076        return self
1077
1078    def __exit__(self, exc_type, exc_val, exc_tb):
1079        self.close()
1080
1081    def _get_size(self):
1082        if self._size is not None:
1083            return self._size
1084        self._size = os.path.getsize(self._filename)
1085        if self._size < self._header_size:
1086            errmsg = ('%s is too small for a packfile (%d < %d)' %
1087                      (self._filename, self._size, self._header_size))
1088            raise AssertionError(errmsg)
1089        return self._size
1090
1091    def __len__(self):
1092        """Returns the number of objects in this pack."""
1093        return self._num_objects
1094
1095    def calculate_checksum(self):
1096        """Calculate the checksum for this pack.
1097
1098        Returns: 20-byte binary SHA1 digest
1099        """
1100        return compute_file_sha(self._file, end_ofs=-20).digest()
1101
1102    def get_ref(self, sha):
1103        """Get the object for a ref SHA, only looking in this pack."""
1104        # TODO: cache these results
1105        if self.pack is None:
1106            raise KeyError(sha)
1107        try:
1108            offset = self.pack.index.object_index(sha)
1109        except KeyError:
1110            offset = None
1111        if offset:
1112            type, obj = self.get_object_at(offset)
1113        elif self.pack is not None and self.pack.resolve_ext_ref:
1114            type, obj = self.pack.resolve_ext_ref(sha)
1115        else:
1116            raise KeyError(sha)
1117        return offset, type, obj
1118
1119    def resolve_object(self, offset, type, obj, get_ref=None):
1120        """Resolve an object, possibly resolving deltas when necessary.
1121
1122        Returns: Tuple with object type and contents.
1123        """
1124        # Walk down the delta chain, building a stack of deltas to reach
1125        # the requested object.
1126        base_offset = offset
1127        base_type = type
1128        base_obj = obj
1129        delta_stack = []
1130        while base_type in DELTA_TYPES:
1131            prev_offset = base_offset
1132            if get_ref is None:
1133                get_ref = self.get_ref
1134            if base_type == OFS_DELTA:
1135                (delta_offset, delta) = base_obj
1136                # TODO: clean up asserts and replace with nicer error messages
1137                base_offset = base_offset - delta_offset
1138                base_type, base_obj = self.get_object_at(base_offset)
1139                assert isinstance(base_type, int)
1140            elif base_type == REF_DELTA:
1141                (basename, delta) = base_obj
1142                assert isinstance(basename, bytes) and len(basename) == 20
1143                base_offset, base_type, base_obj = get_ref(basename)
1144                assert isinstance(base_type, int)
1145            delta_stack.append((prev_offset, base_type, delta))
1146
1147        # Now grab the base object (mustn't be a delta) and apply the
1148        # deltas all the way up the stack.
1149        chunks = base_obj
1150        for prev_offset, delta_type, delta in reversed(delta_stack):
1151            chunks = apply_delta(chunks, delta)
1152            # TODO(dborowitz): This can result in poor performance if
1153            # large base objects are separated from deltas in the pack.
1154            # We should reorganize so that we apply deltas to all
1155            # objects in a chain one after the other to optimize cache
1156            # performance.
1157            if prev_offset is not None:
1158                self._offset_cache[prev_offset] = base_type, chunks
1159        return base_type, chunks
1160
1161    def iterobjects(self, progress=None, compute_crc32=True):
1162        self._file.seek(self._header_size)
1163        for i in range(1, self._num_objects + 1):
1164            offset = self._file.tell()
1165            unpacked, unused = unpack_object(
1166              self._file.read, compute_crc32=compute_crc32)
1167            if progress is not None:
1168                progress(i, self._num_objects)
1169            yield (offset, unpacked.pack_type_num, unpacked._obj(),
1170                   unpacked.crc32)
1171            # Back up over unused data.
1172            self._file.seek(-len(unused), SEEK_CUR)
1173
1174    def _iter_unpacked(self):
1175        # TODO(dborowitz): Merge this with iterobjects, if we can change its
1176        # return type.
1177        self._file.seek(self._header_size)
1178
1179        if self._num_objects is None:
1180            return
1181
1182        for _ in range(self._num_objects):
1183            offset = self._file.tell()
1184            unpacked, unused = unpack_object(
1185              self._file.read, compute_crc32=False)
1186            unpacked.offset = offset
1187            yield unpacked
1188            # Back up over unused data.
1189            self._file.seek(-len(unused), SEEK_CUR)
1190
1191    def iterentries(self, progress=None):
1192        """Yield entries summarizing the contents of this pack.
1193
1194        Args:
1195          progress: Progress function, called with current and total
1196            object count.
1197        Returns: iterator of tuples with (sha, offset, crc32)
1198        """
1199        num_objects = self._num_objects
1200        resolve_ext_ref = (
1201            self.pack.resolve_ext_ref if self.pack is not None else None)
1202        indexer = PackIndexer.for_pack_data(
1203            self, resolve_ext_ref=resolve_ext_ref)
1204        for i, result in enumerate(indexer):
1205            if progress is not None:
1206                progress(i, num_objects)
1207            yield result
1208
1209    def sorted_entries(self, progress=None):
1210        """Return entries in this pack, sorted by SHA.
1211
1212        Args:
1213          progress: Progress function, called with current and total
1214            object count
1215        Returns: List of tuples with (sha, offset, crc32)
1216        """
1217        ret = sorted(self.iterentries(progress=progress))
1218        return ret
1219
1220    def create_index_v1(self, filename, progress=None):
1221        """Create a version 1 file for this data file.
1222
1223        Args:
1224          filename: Index filename.
1225          progress: Progress report function
1226        Returns: Checksum of index file
1227        """
1228        entries = self.sorted_entries(progress=progress)
1229        with GitFile(filename, 'wb') as f:
1230            return write_pack_index_v1(f, entries, self.calculate_checksum())
1231
1232    def create_index_v2(self, filename, progress=None):
1233        """Create a version 2 index file for this data file.
1234
1235        Args:
1236          filename: Index filename.
1237          progress: Progress report function
1238        Returns: Checksum of index file
1239        """
1240        entries = self.sorted_entries(progress=progress)
1241        with GitFile(filename, 'wb') as f:
1242            return write_pack_index_v2(f, entries, self.calculate_checksum())
1243
1244    def create_index(self, filename, progress=None,
1245                     version=2):
1246        """Create an  index file for this data file.
1247
1248        Args:
1249          filename: Index filename.
1250          progress: Progress report function
1251        Returns: Checksum of index file
1252        """
1253        if version == 1:
1254            return self.create_index_v1(filename, progress)
1255        elif version == 2:
1256            return self.create_index_v2(filename, progress)
1257        else:
1258            raise ValueError('unknown index format %d' % version)
1259
1260    def get_stored_checksum(self):
1261        """Return the expected checksum stored in this pack."""
1262        self._file.seek(-20, SEEK_END)
1263        return self._file.read(20)
1264
1265    def check(self):
1266        """Check the consistency of this pack."""
1267        actual = self.calculate_checksum()
1268        stored = self.get_stored_checksum()
1269        if actual != stored:
1270            raise ChecksumMismatch(stored, actual)
1271
1272    def get_compressed_data_at(self, offset):
1273        """Given offset in the packfile return compressed data that is there.
1274
1275        Using the associated index the location of an object can be looked up,
1276        and then the packfile can be asked directly for that object using this
1277        function.
1278        """
1279        assert offset >= self._header_size
1280        self._file.seek(offset)
1281        unpacked, _ = unpack_object(self._file.read, include_comp=True)
1282        return (unpacked.pack_type_num, unpacked.delta_base,
1283                unpacked.comp_chunks)
1284
1285    def get_object_at(self, offset):
1286        """Given an offset in to the packfile return the object that is there.
1287
1288        Using the associated index the location of an object can be looked up,
1289        and then the packfile can be asked directly for that object using this
1290        function.
1291        """
1292        try:
1293            return self._offset_cache[offset]
1294        except KeyError:
1295            pass
1296        assert offset >= self._header_size
1297        self._file.seek(offset)
1298        unpacked, _ = unpack_object(self._file.read)
1299        return (unpacked.pack_type_num, unpacked._obj())
1300
1301
1302class DeltaChainIterator(object):
1303    """Abstract iterator over pack data based on delta chains.
1304
1305    Each object in the pack is guaranteed to be inflated exactly once,
1306    regardless of how many objects reference it as a delta base. As a result,
1307    memory usage is proportional to the length of the longest delta chain.
1308
1309    Subclasses can override _result to define the result type of the iterator.
1310    By default, results are UnpackedObjects with the following members set:
1311
1312    * offset
1313    * obj_type_num
1314    * obj_chunks
1315    * pack_type_num
1316    * delta_base     (for delta types)
1317    * comp_chunks    (if _include_comp is True)
1318    * decomp_chunks
1319    * decomp_len
1320    * crc32          (if _compute_crc32 is True)
1321    """
1322
1323    _compute_crc32 = False
1324    _include_comp = False
1325
1326    def __init__(self, file_obj, resolve_ext_ref=None):
1327        self._file = file_obj
1328        self._resolve_ext_ref = resolve_ext_ref
1329        self._pending_ofs = defaultdict(list)
1330        self._pending_ref = defaultdict(list)
1331        self._full_ofs = []
1332        self._shas = {}
1333        self._ext_refs = []
1334
1335    @classmethod
1336    def for_pack_data(cls, pack_data, resolve_ext_ref=None):
1337        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1338        walker.set_pack_data(pack_data)
1339        for unpacked in pack_data._iter_unpacked():
1340            walker.record(unpacked)
1341        return walker
1342
1343    def record(self, unpacked):
1344        type_num = unpacked.pack_type_num
1345        offset = unpacked.offset
1346        if type_num == OFS_DELTA:
1347            base_offset = offset - unpacked.delta_base
1348            self._pending_ofs[base_offset].append(offset)
1349        elif type_num == REF_DELTA:
1350            self._pending_ref[unpacked.delta_base].append(offset)
1351        else:
1352            self._full_ofs.append((offset, type_num))
1353
1354    def set_pack_data(self, pack_data):
1355        self._file = pack_data._file
1356
1357    def _walk_all_chains(self):
1358        for offset, type_num in self._full_ofs:
1359            for result in self._follow_chain(offset, type_num, None):
1360                yield result
1361        for result in self._walk_ref_chains():
1362            yield result
1363        assert not self._pending_ofs
1364
1365    def _ensure_no_pending(self):
1366        if self._pending_ref:
1367            raise KeyError([sha_to_hex(s) for s in self._pending_ref])
1368
1369    def _walk_ref_chains(self):
1370        if not self._resolve_ext_ref:
1371            self._ensure_no_pending()
1372            return
1373
1374        for base_sha, pending in sorted(self._pending_ref.items()):
1375            if base_sha not in self._pending_ref:
1376                continue
1377            try:
1378                type_num, chunks = self._resolve_ext_ref(base_sha)
1379            except KeyError:
1380                # Not an external ref, but may depend on one. Either it will
1381                # get popped via a _follow_chain call, or we will raise an
1382                # error below.
1383                continue
1384            self._ext_refs.append(base_sha)
1385            self._pending_ref.pop(base_sha)
1386            for new_offset in pending:
1387                for result in self._follow_chain(new_offset, type_num, chunks):
1388                    yield result
1389
1390        self._ensure_no_pending()
1391
1392    def _result(self, unpacked):
1393        return unpacked
1394
1395    def _resolve_object(self, offset, obj_type_num, base_chunks):
1396        self._file.seek(offset)
1397        unpacked, _ = unpack_object(
1398          self._file.read, include_comp=self._include_comp,
1399          compute_crc32=self._compute_crc32)
1400        unpacked.offset = offset
1401        if base_chunks is None:
1402            assert unpacked.pack_type_num == obj_type_num
1403        else:
1404            assert unpacked.pack_type_num in DELTA_TYPES
1405            unpacked.obj_type_num = obj_type_num
1406            unpacked.obj_chunks = apply_delta(base_chunks,
1407                                              unpacked.decomp_chunks)
1408        return unpacked
1409
1410    def _follow_chain(self, offset, obj_type_num, base_chunks):
1411        # Unlike PackData.get_object_at, there is no need to cache offsets as
1412        # this approach by design inflates each object exactly once.
1413        todo = [(offset, obj_type_num, base_chunks)]
1414        for offset, obj_type_num, base_chunks in todo:
1415            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1416            yield self._result(unpacked)
1417
1418            unblocked = chain(self._pending_ofs.pop(unpacked.offset, []),
1419                              self._pending_ref.pop(unpacked.sha(), []))
1420            todo.extend(
1421                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)
1422                for new_offset in unblocked)
1423
1424    def __iter__(self):
1425        return self._walk_all_chains()
1426
1427    def ext_refs(self):
1428        return self._ext_refs
1429
1430
1431class PackIndexer(DeltaChainIterator):
1432    """Delta chain iterator that yields index entries."""
1433
1434    _compute_crc32 = True
1435
1436    def _result(self, unpacked):
1437        return unpacked.sha(), unpacked.offset, unpacked.crc32
1438
1439
1440class PackInflater(DeltaChainIterator):
1441    """Delta chain iterator that yields ShaFile objects."""
1442
1443    def _result(self, unpacked):
1444        return unpacked.sha_file()
1445
1446
1447class SHA1Reader(object):
1448    """Wrapper for file-like object that remembers the SHA1 of its data."""
1449
1450    def __init__(self, f):
1451        self.f = f
1452        self.sha1 = sha1(b'')
1453
1454    def read(self, num=None):
1455        data = self.f.read(num)
1456        self.sha1.update(data)
1457        return data
1458
1459    def check_sha(self):
1460        stored = self.f.read(20)
1461        if stored != self.sha1.digest():
1462            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1463
1464    def close(self):
1465        return self.f.close()
1466
1467    def tell(self):
1468        return self.f.tell()
1469
1470
1471class SHA1Writer(object):
1472    """Wrapper for file-like object that remembers the SHA1 of its data."""
1473
1474    def __init__(self, f):
1475        self.f = f
1476        self.length = 0
1477        self.sha1 = sha1(b'')
1478
1479    def write(self, data):
1480        self.sha1.update(data)
1481        self.f.write(data)
1482        self.length += len(data)
1483
1484    def write_sha(self):
1485        sha = self.sha1.digest()
1486        assert len(sha) == 20
1487        self.f.write(sha)
1488        self.length += len(sha)
1489        return sha
1490
1491    def close(self):
1492        sha = self.write_sha()
1493        self.f.close()
1494        return sha
1495
1496    def offset(self):
1497        return self.length
1498
1499    def tell(self):
1500        return self.f.tell()
1501
1502
1503def pack_object_header(type_num, delta_base, size):
1504    """Create a pack object header for the given object info.
1505
1506    Args:
1507      type_num: Numeric type of the object.
1508      delta_base: Delta base offset or ref, or None for whole objects.
1509      size: Uncompressed object size.
1510    Returns: A header for a packed object.
1511    """
1512    header = []
1513    c = (type_num << 4) | (size & 15)
1514    size >>= 4
1515    while size:
1516        header.append(c | 0x80)
1517        c = size & 0x7f
1518        size >>= 7
1519    header.append(c)
1520    if type_num == OFS_DELTA:
1521        ret = [delta_base & 0x7f]
1522        delta_base >>= 7
1523        while delta_base:
1524            delta_base -= 1
1525            ret.insert(0, 0x80 | (delta_base & 0x7f))
1526            delta_base >>= 7
1527        header.extend(ret)
1528    elif type_num == REF_DELTA:
1529        assert len(delta_base) == 20
1530        header += delta_base
1531    return bytearray(header)
1532
1533
1534def write_pack_object(f, type, object, sha=None, compression_level=-1):
1535    """Write pack object to a file.
1536
1537    Args:
1538      f: File to write to
1539      type: Numeric type of the object
1540      object: Object to write
1541      compression_level: the zlib compression level
1542    Returns: Tuple with offset at which the object was written, and crc32
1543    """
1544    if type in DELTA_TYPES:
1545        delta_base, object = object
1546    else:
1547        delta_base = None
1548    header = bytes(pack_object_header(type, delta_base, len(object)))
1549    comp_data = zlib.compress(object, compression_level)
1550    crc32 = 0
1551    for data in (header, comp_data):
1552        f.write(data)
1553        if sha is not None:
1554            sha.update(data)
1555        crc32 = binascii.crc32(data, crc32)
1556    return crc32 & 0xffffffff
1557
1558
1559def write_pack(filename, objects, deltify=None, delta_window_size=None,
1560               compression_level=-1):
1561    """Write a new pack data file.
1562
1563    Args:
1564      filename: Path to the new pack file (without .pack extension)
1565      objects: Iterable of (object, path) tuples to write.
1566        Should provide __len__
1567      window_size: Delta window size
1568      deltify: Whether to deltify pack objects
1569      compression_level: the zlib compression level
1570    Returns: Tuple with checksum of pack file and index file
1571    """
1572    with GitFile(filename + '.pack', 'wb') as f:
1573        entries, data_sum = write_pack_objects(
1574            f, objects, delta_window_size=delta_window_size, deltify=deltify,
1575            compression_level=compression_level)
1576    entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1577    with GitFile(filename + '.idx', 'wb') as f:
1578        return data_sum, write_pack_index_v2(f, entries, data_sum)
1579
1580
1581def write_pack_header(f, num_objects):
1582    """Write a pack header for the given number of objects."""
1583    f.write(b'PACK')                          # Pack header
1584    f.write(struct.pack(b'>L', 2))            # Pack version
1585    f.write(struct.pack(b'>L', num_objects))  # Number of objects in pack
1586
1587
1588def deltify_pack_objects(objects, window_size=None):
1589    """Generate deltas for pack objects.
1590
1591    Args:
1592      objects: An iterable of (object, path) tuples to deltify.
1593      window_size: Window size; None for default
1594    Returns: Iterator over type_num, object id, delta_base, content
1595        delta_base is None for full text entries
1596    """
1597    # TODO(jelmer): Use threads
1598    if window_size is None:
1599        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
1600    # Build a list of objects ordered by the magic Linus heuristic
1601    # This helps us find good objects to diff against us
1602    magic = []
1603    for obj, path in objects:
1604        magic.append((obj.type_num, path, -obj.raw_length(), obj))
1605    magic.sort()
1606
1607    possible_bases = deque()
1608
1609    for type_num, path, neg_length, o in magic:
1610        raw = o.as_raw_string()
1611        winner = raw
1612        winner_base = None
1613        for base in possible_bases:
1614            if base.type_num != type_num:
1615                continue
1616            delta = create_delta(base.as_raw_string(), raw)
1617            if len(delta) < len(winner):
1618                winner_base = base.sha().digest()
1619                winner = delta
1620        yield type_num, o.sha().digest(), winner_base, winner
1621        possible_bases.appendleft(o)
1622        while len(possible_bases) > window_size:
1623            possible_bases.pop()
1624
1625
1626def pack_objects_to_data(objects):
1627    """Create pack data from objects
1628
1629    Args:
1630      objects: Pack objects
1631    Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
1632    """
1633    count = len(objects)
1634    return (count,
1635            ((o.type_num, o.sha().digest(), None, o.as_raw_string())
1636             for (o, path) in objects))
1637
1638
1639def write_pack_objects(f, objects, delta_window_size=None, deltify=None,
1640                       compression_level=-1):
1641    """Write a new pack data file.
1642
1643    Args:
1644      f: File to write to
1645      objects: Iterable of (object, path) tuples to write.
1646        Should provide __len__
1647      window_size: Sliding window size for searching for deltas;
1648                        Set to None for default window size.
1649      deltify: Whether to deltify objects
1650      compression_level: the zlib compression level to use
1651    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
1652    """
1653    if deltify is None:
1654        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
1655        # slow at the moment.
1656        deltify = False
1657    if deltify:
1658        pack_contents = deltify_pack_objects(objects, delta_window_size)
1659        pack_contents_count = len(objects)
1660    else:
1661        pack_contents_count, pack_contents = pack_objects_to_data(objects)
1662
1663    return write_pack_data(
1664        f, pack_contents_count, pack_contents,
1665        compression_level=compression_level)
1666
1667
1668def write_pack_data(
1669        f, num_records, records, progress=None, compression_level=-1):
1670    """Write a new pack data file.
1671
1672    Args:
1673      f: File to write to
1674      num_records: Number of records
1675      records: Iterator over type_num, object_id, delta_base, raw
1676      progress: Function to report progress to
1677      compression_level: the zlib compression level
1678    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
1679    """
1680    # Write the pack
1681    entries = {}
1682    f = SHA1Writer(f)
1683    write_pack_header(f, num_records)
1684    for i, (type_num, object_id, delta_base, raw) in enumerate(records):
1685        if progress is not None:
1686            progress((
1687                'writing pack data: %d/%d\r' %
1688                (i, num_records)).encode('ascii'))
1689        offset = f.offset()
1690        if delta_base is not None:
1691            try:
1692                base_offset, base_crc32 = entries[delta_base]
1693            except KeyError:
1694                type_num = REF_DELTA
1695                raw = (delta_base, raw)
1696            else:
1697                type_num = OFS_DELTA
1698                raw = (offset - base_offset, raw)
1699        crc32 = write_pack_object(
1700            f, type_num, raw, compression_level=compression_level)
1701        entries[object_id] = (offset, crc32)
1702    return entries, f.write_sha()
1703
1704
1705def write_pack_index_v1(f, entries, pack_checksum):
1706    """Write a new pack index file.
1707
1708    Args:
1709      f: A file-like object to write to
1710      entries: List of tuples with object name (sha), offset_in_pack,
1711        and crc32_checksum.
1712      pack_checksum: Checksum of the pack file.
1713    Returns: The SHA of the written index file
1714    """
1715    f = SHA1Writer(f)
1716    fan_out_table = defaultdict(lambda: 0)
1717    for (name, offset, entry_checksum) in entries:
1718        fan_out_table[ord(name[:1])] += 1
1719    # Fan-out table
1720    for i in range(0x100):
1721        f.write(struct.pack('>L', fan_out_table[i]))
1722        fan_out_table[i+1] += fan_out_table[i]
1723    for (name, offset, entry_checksum) in entries:
1724        if not (offset <= 0xffffffff):
1725            raise TypeError("pack format 1 only supports offsets < 2Gb")
1726        f.write(struct.pack('>L20s', offset, name))
1727    assert len(pack_checksum) == 20
1728    f.write(pack_checksum)
1729    return f.write_sha()
1730
1731
1732def _delta_encode_size(size):
1733    ret = bytearray()
1734    c = size & 0x7f
1735    size >>= 7
1736    while size:
1737        ret.append(c | 0x80)
1738        c = size & 0x7f
1739        size >>= 7
1740    ret.append(c)
1741    return ret
1742
1743
1744# The length of delta compression copy operations in version 2 packs is limited
1745# to 64K.  To copy more, we use several copy operations.  Version 3 packs allow
1746# 24-bit lengths in copy operations, but we always make version 2 packs.
1747_MAX_COPY_LEN = 0xffff
1748
1749
1750def _encode_copy_operation(start, length):
1751    scratch = []
1752    op = 0x80
1753    for i in range(4):
1754        if start & 0xff << i*8:
1755            scratch.append((start >> i*8) & 0xff)
1756            op |= 1 << i
1757    for i in range(2):
1758        if length & 0xff << i*8:
1759            scratch.append((length >> i*8) & 0xff)
1760            op |= 1 << (4+i)
1761    return bytearray([op] + scratch)
1762
1763
1764def create_delta(base_buf, target_buf):
1765    """Use python difflib to work out how to transform base_buf to target_buf.
1766
1767    Args:
1768      base_buf: Base buffer
1769      target_buf: Target buffer
1770    """
1771    assert isinstance(base_buf, bytes)
1772    assert isinstance(target_buf, bytes)
1773    out_buf = bytearray()
1774    # write delta header
1775    out_buf += _delta_encode_size(len(base_buf))
1776    out_buf += _delta_encode_size(len(target_buf))
1777    # write out delta opcodes
1778    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
1779    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
1780        # Git patch opcodes don't care about deletes!
1781        # if opcode == 'replace' or opcode == 'delete':
1782        #    pass
1783        if opcode == 'equal':
1784            # If they are equal, unpacker will use data from base_buf
1785            # Write out an opcode that says what range to use
1786            copy_start = i1
1787            copy_len = i2 - i1
1788            while copy_len > 0:
1789                to_copy = min(copy_len, _MAX_COPY_LEN)
1790                out_buf += _encode_copy_operation(copy_start, to_copy)
1791                copy_start += to_copy
1792                copy_len -= to_copy
1793        if opcode == 'replace' or opcode == 'insert':
1794            # If we are replacing a range or adding one, then we just
1795            # output it to the stream (prefixed by its size)
1796            s = j2 - j1
1797            o = j1
1798            while s > 127:
1799                out_buf.append(127)
1800                out_buf += bytearray(target_buf[o:o+127])
1801                s -= 127
1802                o += 127
1803            out_buf.append(s)
1804            out_buf += bytearray(target_buf[o:o+s])
1805    return bytes(out_buf)
1806
1807
1808def apply_delta(src_buf, delta):
1809    """Based on the similar function in git's patch-delta.c.
1810
1811    Args:
1812      src_buf: Source buffer
1813      delta: Delta instructions
1814    """
1815    if not isinstance(src_buf, bytes):
1816        src_buf = b''.join(src_buf)
1817    if not isinstance(delta, bytes):
1818        delta = b''.join(delta)
1819    out = []
1820    index = 0
1821    delta_length = len(delta)
1822
1823    def get_delta_header_size(delta, index):
1824        size = 0
1825        i = 0
1826        while delta:
1827            cmd = ord(delta[index:index+1])
1828            index += 1
1829            size |= (cmd & ~0x80) << i
1830            i += 7
1831            if not cmd & 0x80:
1832                break
1833        return size, index
1834    src_size, index = get_delta_header_size(delta, index)
1835    dest_size, index = get_delta_header_size(delta, index)
1836    assert src_size == len(src_buf), '%d vs %d' % (src_size, len(src_buf))
1837    while index < delta_length:
1838        cmd = ord(delta[index:index+1])
1839        index += 1
1840        if cmd & 0x80:
1841            cp_off = 0
1842            for i in range(4):
1843                if cmd & (1 << i):
1844                    x = ord(delta[index:index+1])
1845                    index += 1
1846                    cp_off |= x << (i * 8)
1847            cp_size = 0
1848            # Version 3 packs can contain copy sizes larger than 64K.
1849            for i in range(3):
1850                if cmd & (1 << (4+i)):
1851                    x = ord(delta[index:index+1])
1852                    index += 1
1853                    cp_size |= x << (i * 8)
1854            if cp_size == 0:
1855                cp_size = 0x10000
1856            if (cp_off + cp_size < cp_size or
1857                    cp_off + cp_size > src_size or
1858                    cp_size > dest_size):
1859                break
1860            out.append(src_buf[cp_off:cp_off+cp_size])
1861        elif cmd != 0:
1862            out.append(delta[index:index+cmd])
1863            index += cmd
1864        else:
1865            raise ApplyDeltaError('Invalid opcode 0')
1866
1867    if index != delta_length:
1868        raise ApplyDeltaError('delta not empty: %r' % delta[index:])
1869
1870    if dest_size != chunks_length(out):
1871        raise ApplyDeltaError('dest size incorrect')
1872
1873    return out
1874
1875
1876def write_pack_index_v2(f, entries, pack_checksum):
1877    """Write a new pack index file.
1878
1879    Args:
1880      f: File-like object to write to
1881      entries: List of tuples with object name (sha), offset_in_pack, and
1882        crc32_checksum.
1883      pack_checksum: Checksum of the pack file.
1884    Returns: The SHA of the index file written
1885    """
1886    f = SHA1Writer(f)
1887    f.write(b'\377tOc')  # Magic!
1888    f.write(struct.pack('>L', 2))
1889    fan_out_table = defaultdict(lambda: 0)
1890    for (name, offset, entry_checksum) in entries:
1891        fan_out_table[ord(name[:1])] += 1
1892    # Fan-out table
1893    largetable = []
1894    for i in range(0x100):
1895        f.write(struct.pack(b'>L', fan_out_table[i]))
1896        fan_out_table[i+1] += fan_out_table[i]
1897    for (name, offset, entry_checksum) in entries:
1898        f.write(name)
1899    for (name, offset, entry_checksum) in entries:
1900        f.write(struct.pack(b'>L', entry_checksum))
1901    for (name, offset, entry_checksum) in entries:
1902        if offset < 2**31:
1903            f.write(struct.pack(b'>L', offset))
1904        else:
1905            f.write(struct.pack(b'>L', 2**31 + len(largetable)))
1906            largetable.append(offset)
1907    for offset in largetable:
1908        f.write(struct.pack(b'>Q', offset))
1909    assert len(pack_checksum) == 20
1910    f.write(pack_checksum)
1911    return f.write_sha()
1912
1913
1914write_pack_index = write_pack_index_v2
1915
1916
1917class Pack(object):
1918    """A Git pack object."""
1919
1920    def __init__(self, basename, resolve_ext_ref=None):
1921        self._basename = basename
1922        self._data = None
1923        self._idx = None
1924        self._idx_path = self._basename + '.idx'
1925        self._data_path = self._basename + '.pack'
1926        self._data_load = lambda: PackData(self._data_path)
1927        self._idx_load = lambda: load_pack_index(self._idx_path)
1928        self.resolve_ext_ref = resolve_ext_ref
1929
1930    @classmethod
1931    def from_lazy_objects(self, data_fn, idx_fn):
1932        """Create a new pack object from callables to load pack data and
1933        index objects."""
1934        ret = Pack('')
1935        ret._data_load = data_fn
1936        ret._idx_load = idx_fn
1937        return ret
1938
1939    @classmethod
1940    def from_objects(self, data, idx):
1941        """Create a new pack object from pack data and index objects."""
1942        ret = Pack('')
1943        ret._data_load = lambda: data
1944        ret._idx_load = lambda: idx
1945        return ret
1946
1947    def name(self):
1948        """The SHA over the SHAs of the objects in this pack."""
1949        return self.index.objects_sha1()
1950
1951    @property
1952    def data(self):
1953        """The pack data object being used."""
1954        if self._data is None:
1955            self._data = self._data_load()
1956            self._data.pack = self
1957            self.check_length_and_checksum()
1958        return self._data
1959
1960    @property
1961    def index(self):
1962        """The index being used.
1963
1964        Note: This may be an in-memory index
1965        """
1966        if self._idx is None:
1967            self._idx = self._idx_load()
1968        return self._idx
1969
1970    def close(self):
1971        if self._data is not None:
1972            self._data.close()
1973        if self._idx is not None:
1974            self._idx.close()
1975
1976    def __enter__(self):
1977        return self
1978
1979    def __exit__(self, exc_type, exc_val, exc_tb):
1980        self.close()
1981
1982    def __eq__(self, other):
1983        return isinstance(self, type(other)) and self.index == other.index
1984
1985    def __len__(self):
1986        """Number of entries in this pack."""
1987        return len(self.index)
1988
1989    def __repr__(self):
1990        return '%s(%r)' % (self.__class__.__name__, self._basename)
1991
1992    def __iter__(self):
1993        """Iterate over all the sha1s of the objects in this pack."""
1994        return iter(self.index)
1995
1996    def check_length_and_checksum(self):
1997        """Sanity check the length and checksum of the pack index and data."""
1998        assert len(self.index) == len(self.data)
1999        idx_stored_checksum = self.index.get_pack_checksum()
2000        data_stored_checksum = self.data.get_stored_checksum()
2001        if idx_stored_checksum != data_stored_checksum:
2002            raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
2003                                   sha_to_hex(data_stored_checksum))
2004
2005    def check(self):
2006        """Check the integrity of this pack.
2007
2008        Raises:
2009          ChecksumMismatch: if a checksum for the index or data is wrong
2010        """
2011        self.index.check()
2012        self.data.check()
2013        for obj in self.iterobjects():
2014            obj.check()
2015        # TODO: object connectivity checks
2016
2017    def get_stored_checksum(self):
2018        return self.data.get_stored_checksum()
2019
2020    def __contains__(self, sha1):
2021        """Check whether this pack contains a particular SHA1."""
2022        try:
2023            self.index.object_index(sha1)
2024            return True
2025        except KeyError:
2026            return False
2027
2028    def get_raw_unresolved(self, sha1):
2029        """Get raw unresolved data for a SHA.
2030
2031        Args:
2032          sha1: SHA to return data for
2033        Returns: Tuple with pack object type, delta base (if applicable),
2034            list of data chunks
2035        """
2036        offset = self.index.object_index(sha1)
2037        (obj_type, delta_base, chunks) = self.data.get_compressed_data_at(
2038                offset)
2039        if obj_type == OFS_DELTA:
2040            delta_base = sha_to_hex(
2041                    self.index.object_sha1(offset - delta_base))
2042            obj_type = REF_DELTA
2043        return (obj_type, delta_base, chunks)
2044
2045    def get_raw(self, sha1):
2046        offset = self.index.object_index(sha1)
2047        obj_type, obj = self.data.get_object_at(offset)
2048        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
2049        return type_num, b''.join(chunks)
2050
2051    def __getitem__(self, sha1):
2052        """Retrieve the specified SHA1."""
2053        type, uncomp = self.get_raw(sha1)
2054        return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2055
2056    def iterobjects(self):
2057        """Iterate over the objects in this pack."""
2058        return iter(PackInflater.for_pack_data(
2059            self.data, resolve_ext_ref=self.resolve_ext_ref))
2060
2061    def pack_tuples(self):
2062        """Provide an iterable for use with write_pack_objects.
2063
2064        Returns: Object that can iterate over (object, path) tuples
2065            and provides __len__
2066        """
2067        class PackTupleIterable(object):
2068
2069            def __init__(self, pack):
2070                self.pack = pack
2071
2072            def __len__(self):
2073                return len(self.pack)
2074
2075            def __iter__(self):
2076                return ((o, None) for o in self.pack.iterobjects())
2077
2078        return PackTupleIterable(self)
2079
2080    def keep(self, msg=None):
2081        """Add a .keep file for the pack, preventing git from garbage collecting it.
2082
2083        Args:
2084          msg: A message written inside the .keep file; can be used later
2085            to determine whether or not a .keep file is obsolete.
2086        Returns: The path of the .keep file, as a string.
2087        """
2088        keepfile_name = '%s.keep' % self._basename
2089        with GitFile(keepfile_name, 'wb') as keepfile:
2090            if msg:
2091                keepfile.write(msg)
2092                keepfile.write(b'\n')
2093        return keepfile_name
2094
2095
2096try:
2097    from dulwich._pack import apply_delta, bisect_find_sha  # noqa: F811
2098except ImportError:
2099    pass
2100