1# Copyright (C) 2008, 2009, 2010 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16#
17# cython: language_level=3
18
19"""Compiled extensions for doing compression."""
20
21
22cdef extern from "python-compat.h":
23    pass
24
25from libc.stdlib cimport (
26    free,
27    )
28from libc.string cimport (
29    memcpy,
30    )
31
32from cpython.bytes cimport (
33    PyBytes_AS_STRING,
34    PyBytes_CheckExact,
35    PyBytes_FromStringAndSize,
36    PyBytes_GET_SIZE,
37    )
38from cpython.object cimport (
39    PyObject,
40    )
41from cpython.mem cimport (
42    PyMem_Free,
43    PyMem_Malloc,
44    )
45
46
47cdef extern from "delta.h":
48    struct source_info:
49        void *buf
50        unsigned long size
51        unsigned long agg_offset
52    struct delta_index:
53        pass
54    ctypedef enum delta_result:
55        DELTA_OK
56        DELTA_OUT_OF_MEMORY
57        DELTA_INDEX_NEEDED
58        DELTA_SOURCE_EMPTY
59        DELTA_SOURCE_BAD
60        DELTA_BUFFER_EMPTY
61        DELTA_SIZE_TOO_BIG
62    delta_result create_delta_index(source_info *src,
63                                    delta_index *old,
64                                    delta_index **fresh,
65                                    int max_entries) nogil
66    delta_result create_delta_index_from_delta(source_info *delta,
67                                               delta_index *old,
68                                               delta_index **fresh) nogil
69    void free_delta_index(delta_index *index) nogil
70    delta_result create_delta(delta_index *indexes,
71                              void *buf, unsigned long bufsize,
72                              unsigned long *delta_size,
73                              unsigned long max_delta_size,
74                              void **delta_data) nogil
75    unsigned long get_delta_hdr_size(unsigned char **datap,
76                                     unsigned char *top) nogil
77    unsigned long sizeof_delta_index(delta_index *index)
78    Py_ssize_t DELTA_SIZE_MIN
79    int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
80    int get_entry_summary(delta_index *index, int pos,
81                          unsigned int *global_offset, unsigned int *hash_val)
82    unsigned int rabin_hash (unsigned char *data)
83
84
85def make_delta_index(source):
86    return DeltaIndex(source)
87
88
89cdef object _translate_delta_failure(delta_result result):
90    if result == DELTA_OUT_OF_MEMORY:
91        return MemoryError("Delta function failed to allocate memory")
92    elif result == DELTA_INDEX_NEEDED:
93        return ValueError("Delta function requires delta_index param")
94    elif result == DELTA_SOURCE_EMPTY:
95        return ValueError("Delta function given empty source_info param")
96    elif result == DELTA_SOURCE_BAD:
97        return RuntimeError("Delta function given invalid source_info param")
98    elif result == DELTA_BUFFER_EMPTY:
99        return ValueError("Delta function given empty buffer params")
100    return AssertionError("Unrecognised delta result code: %d" % result)
101
102
103def _rabin_hash(content):
104    if not PyBytes_CheckExact(content):
105        raise ValueError('content must be a string')
106    if len(content) < 16:
107        raise ValueError('content must be at least 16 bytes long')
108    # Try to cast it to an int, if it can fit
109    return int(rabin_hash(<unsigned char*>(PyBytes_AS_STRING(content))))
110
111
112cdef class DeltaIndex:
113
114    cdef readonly list _sources
115    cdef source_info *_source_infos
116    cdef delta_index *_index
117    cdef public unsigned long _source_offset
118    cdef readonly unsigned int _max_num_sources
119    cdef public int _max_bytes_to_index
120
121    def __init__(self, source=None, max_bytes_to_index=None):
122        self._sources = []
123        self._index = NULL
124        self._max_num_sources = 65000
125        self._source_infos = <source_info *>PyMem_Malloc(
126            sizeof(source_info) * self._max_num_sources)
127        if self._source_infos == NULL:
128            raise MemoryError('failed to allocate memory for DeltaIndex')
129        self._source_offset = 0
130        self._max_bytes_to_index = 0
131        if max_bytes_to_index is not None:
132            self._max_bytes_to_index = max_bytes_to_index
133
134        if source is not None:
135            self.add_source(source, 0)
136
137    def __sizeof__(self):
138        # We want to track the _source_infos allocations, but the referenced
139        # void* are actually tracked in _sources itself.
140        return (sizeof(DeltaIndex)
141                + (sizeof(source_info) * self._max_num_sources)
142                + sizeof_delta_index(self._index))
143
144    def __repr__(self):
145        return '%s(%d, %d)' % (self.__class__.__name__,
146            len(self._sources), self._source_offset)
147
148    def __dealloc__(self):
149        if self._index != NULL:
150            free_delta_index(self._index)
151            self._index = NULL
152        PyMem_Free(self._source_infos)
153
154    def _has_index(self):
155        return (self._index != NULL)
156
157    def _dump_index(self):
158        """Dump the pointers in the index.
159
160        This is an arbitrary layout, used for testing. It is not meant to be
161        used in production code.
162
163        :return: (hash_list, entry_list)
164            hash_list   A list of offsets, so hash[i] points to the 'hash
165                        bucket' starting at the given offset and going until
166                        hash[i+1]
167            entry_list  A list of (text_offset, hash_val). text_offset is the
168                        offset in the "source" texts, and hash_val is the RABIN
169                        hash for that offset.
170                        Note that the entry should be in the hash bucket
171                        defined by
172                        hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
173        """
174        cdef int pos
175        cdef unsigned int text_offset
176        cdef unsigned int hash_val
177        cdef unsigned int hash_offset
178        if self._index == NULL:
179            return None
180        hash_list = []
181        pos = 0
182        while get_hash_offset(self._index, pos, &hash_offset):
183            hash_list.append(int(hash_offset))
184            pos += 1
185        entry_list = []
186        pos = 0
187        while get_entry_summary(self._index, pos, &text_offset, &hash_val):
188            # Map back using 'int' so that we don't get Long everywhere, when
189            # almost everything is <2**31.
190            val = tuple(map(int, [text_offset, hash_val]))
191            entry_list.append(val)
192            pos += 1
193        return hash_list, entry_list
194
195    def add_delta_source(self, delta, unadded_bytes):
196        """Add a new delta to the source texts.
197
198        :param delta: The text of the delta, this must be a byte string.
199        :param unadded_bytes: Number of bytes that were added to the source
200            that were not indexed.
201        """
202        cdef char *c_delta
203        cdef Py_ssize_t c_delta_size
204        cdef delta_index *index
205        cdef delta_result res
206        cdef unsigned int source_location
207        cdef source_info *src
208        cdef unsigned int num_indexes
209
210        if not PyBytes_CheckExact(delta):
211            raise TypeError('delta is not a bytestring')
212
213        source_location = len(self._sources)
214        if source_location >= self._max_num_sources:
215            self._expand_sources()
216        self._sources.append(delta)
217        c_delta = PyBytes_AS_STRING(delta)
218        c_delta_size = PyBytes_GET_SIZE(delta)
219        src = self._source_infos + source_location
220        src.buf = c_delta
221        src.size = c_delta_size
222        src.agg_offset = self._source_offset + unadded_bytes
223        with nogil:
224            res = create_delta_index_from_delta(src, self._index, &index)
225        if res != DELTA_OK:
226            raise _translate_delta_failure(res)
227        self._source_offset = src.agg_offset + src.size
228        if index != self._index:
229            free_delta_index(self._index)
230            self._index = index
231
232    def add_source(self, source, unadded_bytes):
233        """Add a new bit of source text to the delta indexes.
234
235        :param source: The text in question, this must be a byte string
236        :param unadded_bytes: Assume there are this many bytes that didn't get
237            added between this source and the end of the previous source.
238        :param max_pointers: Add no more than this many entries to the index.
239            By default, we sample every 16 bytes, if that would require more
240            than max_entries, we will reduce the sampling rate.
241            A value of 0 means unlimited, None means use the default limit.
242        """
243        cdef char *c_source
244        cdef Py_ssize_t c_source_size
245        cdef delta_index *index
246        cdef delta_result res
247        cdef unsigned int source_location
248        cdef source_info *src
249        cdef unsigned int num_indexes
250        cdef int max_num_entries
251
252        if not PyBytes_CheckExact(source):
253            raise TypeError('source is not a bytestring')
254
255        source_location = len(self._sources)
256        if source_location >= self._max_num_sources:
257            self._expand_sources()
258        if source_location != 0 and self._index == NULL:
259            # We were lazy about populating the index, create it now
260            self._populate_first_index()
261        self._sources.append(source)
262        c_source = PyBytes_AS_STRING(source)
263        c_source_size = PyBytes_GET_SIZE(source)
264        src = self._source_infos + source_location
265        src.buf = c_source
266        src.size = c_source_size
267
268        src.agg_offset = self._source_offset + unadded_bytes
269        self._source_offset = src.agg_offset + src.size
270        # We delay creating the index on the first insert
271        if source_location != 0:
272            with nogil:
273                res = create_delta_index(src, self._index, &index,
274                                         self._max_bytes_to_index)
275            if res != DELTA_OK:
276                raise _translate_delta_failure(res)
277            if index != self._index:
278                free_delta_index(self._index)
279                self._index = index
280
281    cdef _populate_first_index(self):
282        cdef delta_index *index
283        cdef delta_result res
284        if len(self._sources) != 1 or self._index != NULL:
285            raise AssertionError('_populate_first_index should only be'
286                ' called when we have a single source and no index yet')
287
288        # We know that self._index is already NULL, so create_delta_index
289        # will always create a new index unless there's a malloc failure
290        with nogil:
291            res = create_delta_index(&self._source_infos[0], NULL, &index,
292                                     self._max_bytes_to_index)
293        if res != DELTA_OK:
294            raise _translate_delta_failure(res)
295        self._index = index
296
297    cdef _expand_sources(self):
298        raise RuntimeError('if we move self._source_infos, then we need to'
299                           ' change all of the index pointers as well.')
300
301    def make_delta(self, target_bytes, max_delta_size=0):
302        """Create a delta from the current source to the target bytes."""
303        cdef char *target
304        cdef Py_ssize_t target_size
305        cdef void * delta
306        cdef unsigned long delta_size
307        cdef unsigned long c_max_delta_size
308        cdef delta_result res
309
310        if self._index == NULL:
311            if len(self._sources) == 0:
312                return None
313            # We were just lazy about generating the index
314            self._populate_first_index()
315
316        if not PyBytes_CheckExact(target_bytes):
317            raise TypeError('target is not a bytestring')
318
319        target = PyBytes_AS_STRING(target_bytes)
320        target_size = PyBytes_GET_SIZE(target_bytes)
321
322        # TODO: inline some of create_delta so we at least don't have to double
323        #       malloc, and can instead use PyBytes_FromStringAndSize, to
324        #       allocate the bytes into the final string
325        c_max_delta_size = max_delta_size
326        with nogil:
327            res = create_delta(self._index, target, target_size,
328                               &delta_size, c_max_delta_size, &delta)
329        result = None
330        if res == DELTA_OK:
331            result = PyBytes_FromStringAndSize(<char *>delta, delta_size)
332            free(delta)
333        elif res != DELTA_SIZE_TOO_BIG:
334            raise _translate_delta_failure(res)
335        return result
336
337
338def make_delta(source_bytes, target_bytes):
339    """Create a delta, this is a wrapper around DeltaIndex.make_delta."""
340    di = DeltaIndex(source_bytes)
341    return di.make_delta(target_bytes)
342
343
344def apply_delta(source_bytes, delta_bytes):
345    """Apply a delta generated by make_delta to source_bytes."""
346    cdef char *source
347    cdef Py_ssize_t source_size
348    cdef char *delta
349    cdef Py_ssize_t delta_size
350
351    if not PyBytes_CheckExact(source_bytes):
352        raise TypeError('source is not a bytestring')
353    if not PyBytes_CheckExact(delta_bytes):
354        raise TypeError('delta is not a bytestring')
355    source = PyBytes_AS_STRING(source_bytes)
356    source_size = PyBytes_GET_SIZE(source_bytes)
357    delta = PyBytes_AS_STRING(delta_bytes)
358    delta_size = PyBytes_GET_SIZE(delta_bytes)
359    # Code taken from patch-delta.c, only brought here to give better error
360    # handling, and to avoid double allocating memory
361    if (delta_size < DELTA_SIZE_MIN):
362        # XXX: Invalid delta block
363        raise RuntimeError('delta_size %d smaller than min delta size %d'
364                           % (delta_size, DELTA_SIZE_MIN))
365
366    return _apply_delta(source, source_size, delta, delta_size)
367
368
369cdef unsigned char *_decode_copy_instruction(unsigned char *data,
370    unsigned char cmd, unsigned int *offset,
371    unsigned int *length) nogil: # cannot_raise
372    """Decode a copy instruction from the next few bytes.
373
374    A copy instruction is a variable number of bytes, so we will parse the
375    bytes we care about, and return the new position, as well as the offset and
376    length referred to in the bytes.
377
378    :param bytes: Pointer to the start of bytes after cmd
379    :param cmd: The command code
380    :return: Pointer to the bytes just after the last decode byte
381    """
382    cdef unsigned int off, size, count
383    off = 0
384    size = 0
385    count = 0
386    if (cmd & 0x01):
387        off = data[count]
388        count = count + 1
389    if (cmd & 0x02):
390        off = off | (data[count] << 8)
391        count = count + 1
392    if (cmd & 0x04):
393        off = off | (data[count] << 16)
394        count = count + 1
395    if (cmd & 0x08):
396        off = off | (data[count] << 24)
397        count = count + 1
398    if (cmd & 0x10):
399        size = data[count]
400        count = count + 1
401    if (cmd & 0x20):
402        size = size | (data[count] << 8)
403        count = count + 1
404    if (cmd & 0x40):
405        size = size | (data[count] << 16)
406        count = count + 1
407    if (size == 0):
408        size = 0x10000
409    offset[0] = off
410    length[0] = size
411    return data + count
412
413
414cdef object _apply_delta(char *source, Py_ssize_t source_size,
415                         char *delta, Py_ssize_t delta_size):
416    """common functionality between apply_delta and apply_delta_to_source."""
417    cdef unsigned char *data
418    cdef unsigned char *top
419    cdef unsigned char *dst_buf
420    cdef unsigned char *out
421    cdef unsigned char cmd
422    cdef Py_ssize_t size
423    cdef unsigned int cp_off, cp_size
424    cdef int failed
425
426    data = <unsigned char *>delta
427    top = data + delta_size
428
429    # now the result size
430    size = get_delta_hdr_size(&data, top)
431    result = PyBytes_FromStringAndSize(NULL, size)
432    dst_buf = <unsigned char*>PyBytes_AS_STRING(result)
433
434    failed = 0
435    with nogil:
436        out = dst_buf
437        while (data < top):
438            cmd = data[0]
439            data = data + 1
440            if (cmd & 0x80):
441                # Copy instruction
442                data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
443                if (cp_off + cp_size < cp_size or
444                    cp_off + cp_size > <unsigned int>source_size or
445                    cp_size > <unsigned int>size):
446                    failed = 1
447                    break
448                memcpy(out, source + cp_off, cp_size)
449                out = out + cp_size
450                size = size - cp_size
451            else:
452                # Insert instruction
453                if cmd == 0:
454                    # cmd == 0 is reserved for future encoding
455                    # extensions. In the mean time we must fail when
456                    # encountering them (might be data corruption).
457                    failed = 2
458                    break
459                if cmd > size:
460                    failed = 3
461                    break
462                memcpy(out, data, cmd)
463                out = out + cmd
464                data = data + cmd
465                size = size - cmd
466    if failed:
467        if failed == 1:
468            raise ValueError('Something wrong with:'
469                ' cp_off = %s, cp_size = %s'
470                ' source_size = %s, size = %s'
471                % (cp_off, cp_size, source_size, size))
472        elif failed == 2:
473            raise ValueError('Got delta opcode: 0, not supported')
474        elif failed == 3:
475            raise ValueError('Insert instruction longer than remaining'
476                ' bytes: %d > %d' % (cmd, size))
477
478    # sanity check
479    if (data != top or size != 0):
480        raise RuntimeError('Did not extract the number of bytes we expected'
481            ' we were left with %d bytes in "size", and top - data = %d'
482            % (size, <int>(top - data)))
483
484    # *dst_size = out - dst_buf;
485    if (out - dst_buf) != PyBytes_GET_SIZE(result):
486        raise RuntimeError('Number of bytes extracted did not match the'
487            ' size encoded in the delta header.')
488    return result
489
490
491def apply_delta_to_source(source, delta_start, delta_end):
492    """Extract a delta from source bytes, and apply it."""
493    cdef char *c_source
494    cdef Py_ssize_t c_source_size
495    cdef char *c_delta
496    cdef Py_ssize_t c_delta_size
497    cdef Py_ssize_t c_delta_start, c_delta_end
498
499    if not PyBytes_CheckExact(source):
500        raise TypeError('source is not a str')
501    c_source_size = PyBytes_GET_SIZE(source)
502    c_delta_start = delta_start
503    c_delta_end = delta_end
504    if c_delta_start >= c_source_size:
505        raise ValueError('delta starts after source')
506    if c_delta_end > c_source_size:
507        raise ValueError('delta ends after source')
508    if c_delta_start >= c_delta_end:
509        raise ValueError('delta starts after it ends')
510
511    c_delta_size = c_delta_end - c_delta_start
512    c_source = PyBytes_AS_STRING(source)
513    c_delta = c_source + c_delta_start
514    # We don't use source_size, because we know the delta should not refer to
515    # any bytes after it starts
516    return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
517
518
519def encode_base128_int(val):
520    """Convert an integer into a 7-bit lsb encoding."""
521    cdef unsigned int c_val
522    cdef Py_ssize_t count
523    cdef unsigned int num_bytes
524    cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
525
526    c_val = val
527    count = 0
528    while c_val >= 0x80 and count < 8:
529        c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
530        c_val = c_val >> 7
531        count = count + 1
532    if count >= 8 or c_val >= 0x80:
533        raise ValueError('encode_base128_int overflowed the buffer')
534    c_bytes[count] = <unsigned char>(c_val & 0xFF)
535    count = count + 1
536    return PyBytes_FromStringAndSize(<char *>c_bytes, count)
537
538
539def decode_base128_int(data):
540    """Decode an integer from a 7-bit lsb encoding."""
541    cdef int offset
542    cdef int val
543    cdef unsigned int uval
544    cdef int shift
545    cdef Py_ssize_t num_low_bytes
546    cdef unsigned char *c_bytes
547
548    offset = 0
549    val = 0
550    shift = 0
551    if not PyBytes_CheckExact(data):
552        raise TypeError('bytes is not a string')
553    c_bytes = <unsigned char*>PyBytes_AS_STRING(data)
554    # We take off 1, because we have to be able to decode the non-expanded byte
555    num_low_bytes = PyBytes_GET_SIZE(data) - 1
556    while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
557        val = val | ((c_bytes[offset] & 0x7F) << shift)
558        shift = shift + 7
559        offset = offset + 1
560    if c_bytes[offset] & 0x80:
561        raise ValueError('Data not properly formatted, we ran out of'
562                         ' bytes before 0x80 stopped being set.')
563    val = val | (c_bytes[offset] << shift)
564    offset = offset + 1
565    if val < 0:
566        uval = <unsigned int> val
567        return uval, offset
568    return val, offset
569
570
571