1# Copyright (C) 2008, 2009, 2010 Canonical Ltd 2# 3# This program is free software; you can redistribute it and/or modify 4# it under the terms of the GNU General Public License as published by 5# the Free Software Foundation; either version 2 of the License, or 6# (at your option) any later version. 7# 8# This program is distributed in the hope that it will be useful, 9# but WITHOUT ANY WARRANTY; without even the implied warranty of 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11# GNU General Public License for more details. 12# 13# You should have received a copy of the GNU General Public License 14# along with this program; if not, write to the Free Software 15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 16# 17# cython: language_level=3 18 19"""Compiled extensions for doing compression.""" 20 21 22cdef extern from "python-compat.h": 23 pass 24 25from libc.stdlib cimport ( 26 free, 27 ) 28from libc.string cimport ( 29 memcpy, 30 ) 31 32from cpython.bytes cimport ( 33 PyBytes_AS_STRING, 34 PyBytes_CheckExact, 35 PyBytes_FromStringAndSize, 36 PyBytes_GET_SIZE, 37 ) 38from cpython.object cimport ( 39 PyObject, 40 ) 41from cpython.mem cimport ( 42 PyMem_Free, 43 PyMem_Malloc, 44 ) 45 46 47cdef extern from "delta.h": 48 struct source_info: 49 void *buf 50 unsigned long size 51 unsigned long agg_offset 52 struct delta_index: 53 pass 54 ctypedef enum delta_result: 55 DELTA_OK 56 DELTA_OUT_OF_MEMORY 57 DELTA_INDEX_NEEDED 58 DELTA_SOURCE_EMPTY 59 DELTA_SOURCE_BAD 60 DELTA_BUFFER_EMPTY 61 DELTA_SIZE_TOO_BIG 62 delta_result create_delta_index(source_info *src, 63 delta_index *old, 64 delta_index **fresh, 65 int max_entries) nogil 66 delta_result create_delta_index_from_delta(source_info *delta, 67 delta_index *old, 68 delta_index **fresh) nogil 69 void free_delta_index(delta_index *index) nogil 70 delta_result create_delta(delta_index *indexes, 71 void *buf, unsigned long bufsize, 72 unsigned long *delta_size, 73 unsigned long max_delta_size, 74 void **delta_data) nogil 75 unsigned long get_delta_hdr_size(unsigned char **datap, 76 unsigned char *top) nogil 77 unsigned long sizeof_delta_index(delta_index *index) 78 Py_ssize_t DELTA_SIZE_MIN 79 int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset) 80 int get_entry_summary(delta_index *index, int pos, 81 unsigned int *global_offset, unsigned int *hash_val) 82 unsigned int rabin_hash (unsigned char *data) 83 84 85def make_delta_index(source): 86 return DeltaIndex(source) 87 88 89cdef object _translate_delta_failure(delta_result result): 90 if result == DELTA_OUT_OF_MEMORY: 91 return MemoryError("Delta function failed to allocate memory") 92 elif result == DELTA_INDEX_NEEDED: 93 return ValueError("Delta function requires delta_index param") 94 elif result == DELTA_SOURCE_EMPTY: 95 return ValueError("Delta function given empty source_info param") 96 elif result == DELTA_SOURCE_BAD: 97 return RuntimeError("Delta function given invalid source_info param") 98 elif result == DELTA_BUFFER_EMPTY: 99 return ValueError("Delta function given empty buffer params") 100 return AssertionError("Unrecognised delta result code: %d" % result) 101 102 103def _rabin_hash(content): 104 if not PyBytes_CheckExact(content): 105 raise ValueError('content must be a string') 106 if len(content) < 16: 107 raise ValueError('content must be at least 16 bytes long') 108 # Try to cast it to an int, if it can fit 109 return int(rabin_hash(<unsigned char*>(PyBytes_AS_STRING(content)))) 110 111 112cdef class DeltaIndex: 113 114 cdef readonly list _sources 115 cdef source_info *_source_infos 116 cdef delta_index *_index 117 cdef public unsigned long _source_offset 118 cdef readonly unsigned int _max_num_sources 119 cdef public int _max_bytes_to_index 120 121 def __init__(self, source=None, max_bytes_to_index=None): 122 self._sources = [] 123 self._index = NULL 124 self._max_num_sources = 65000 125 self._source_infos = <source_info *>PyMem_Malloc( 126 sizeof(source_info) * self._max_num_sources) 127 if self._source_infos == NULL: 128 raise MemoryError('failed to allocate memory for DeltaIndex') 129 self._source_offset = 0 130 self._max_bytes_to_index = 0 131 if max_bytes_to_index is not None: 132 self._max_bytes_to_index = max_bytes_to_index 133 134 if source is not None: 135 self.add_source(source, 0) 136 137 def __sizeof__(self): 138 # We want to track the _source_infos allocations, but the referenced 139 # void* are actually tracked in _sources itself. 140 return (sizeof(DeltaIndex) 141 + (sizeof(source_info) * self._max_num_sources) 142 + sizeof_delta_index(self._index)) 143 144 def __repr__(self): 145 return '%s(%d, %d)' % (self.__class__.__name__, 146 len(self._sources), self._source_offset) 147 148 def __dealloc__(self): 149 if self._index != NULL: 150 free_delta_index(self._index) 151 self._index = NULL 152 PyMem_Free(self._source_infos) 153 154 def _has_index(self): 155 return (self._index != NULL) 156 157 def _dump_index(self): 158 """Dump the pointers in the index. 159 160 This is an arbitrary layout, used for testing. It is not meant to be 161 used in production code. 162 163 :return: (hash_list, entry_list) 164 hash_list A list of offsets, so hash[i] points to the 'hash 165 bucket' starting at the given offset and going until 166 hash[i+1] 167 entry_list A list of (text_offset, hash_val). text_offset is the 168 offset in the "source" texts, and hash_val is the RABIN 169 hash for that offset. 170 Note that the entry should be in the hash bucket 171 defined by 172 hash[(hash_val & mask)] && hash[(hash_val & mask) + 1] 173 """ 174 cdef int pos 175 cdef unsigned int text_offset 176 cdef unsigned int hash_val 177 cdef unsigned int hash_offset 178 if self._index == NULL: 179 return None 180 hash_list = [] 181 pos = 0 182 while get_hash_offset(self._index, pos, &hash_offset): 183 hash_list.append(int(hash_offset)) 184 pos += 1 185 entry_list = [] 186 pos = 0 187 while get_entry_summary(self._index, pos, &text_offset, &hash_val): 188 # Map back using 'int' so that we don't get Long everywhere, when 189 # almost everything is <2**31. 190 val = tuple(map(int, [text_offset, hash_val])) 191 entry_list.append(val) 192 pos += 1 193 return hash_list, entry_list 194 195 def add_delta_source(self, delta, unadded_bytes): 196 """Add a new delta to the source texts. 197 198 :param delta: The text of the delta, this must be a byte string. 199 :param unadded_bytes: Number of bytes that were added to the source 200 that were not indexed. 201 """ 202 cdef char *c_delta 203 cdef Py_ssize_t c_delta_size 204 cdef delta_index *index 205 cdef delta_result res 206 cdef unsigned int source_location 207 cdef source_info *src 208 cdef unsigned int num_indexes 209 210 if not PyBytes_CheckExact(delta): 211 raise TypeError('delta is not a bytestring') 212 213 source_location = len(self._sources) 214 if source_location >= self._max_num_sources: 215 self._expand_sources() 216 self._sources.append(delta) 217 c_delta = PyBytes_AS_STRING(delta) 218 c_delta_size = PyBytes_GET_SIZE(delta) 219 src = self._source_infos + source_location 220 src.buf = c_delta 221 src.size = c_delta_size 222 src.agg_offset = self._source_offset + unadded_bytes 223 with nogil: 224 res = create_delta_index_from_delta(src, self._index, &index) 225 if res != DELTA_OK: 226 raise _translate_delta_failure(res) 227 self._source_offset = src.agg_offset + src.size 228 if index != self._index: 229 free_delta_index(self._index) 230 self._index = index 231 232 def add_source(self, source, unadded_bytes): 233 """Add a new bit of source text to the delta indexes. 234 235 :param source: The text in question, this must be a byte string 236 :param unadded_bytes: Assume there are this many bytes that didn't get 237 added between this source and the end of the previous source. 238 :param max_pointers: Add no more than this many entries to the index. 239 By default, we sample every 16 bytes, if that would require more 240 than max_entries, we will reduce the sampling rate. 241 A value of 0 means unlimited, None means use the default limit. 242 """ 243 cdef char *c_source 244 cdef Py_ssize_t c_source_size 245 cdef delta_index *index 246 cdef delta_result res 247 cdef unsigned int source_location 248 cdef source_info *src 249 cdef unsigned int num_indexes 250 cdef int max_num_entries 251 252 if not PyBytes_CheckExact(source): 253 raise TypeError('source is not a bytestring') 254 255 source_location = len(self._sources) 256 if source_location >= self._max_num_sources: 257 self._expand_sources() 258 if source_location != 0 and self._index == NULL: 259 # We were lazy about populating the index, create it now 260 self._populate_first_index() 261 self._sources.append(source) 262 c_source = PyBytes_AS_STRING(source) 263 c_source_size = PyBytes_GET_SIZE(source) 264 src = self._source_infos + source_location 265 src.buf = c_source 266 src.size = c_source_size 267 268 src.agg_offset = self._source_offset + unadded_bytes 269 self._source_offset = src.agg_offset + src.size 270 # We delay creating the index on the first insert 271 if source_location != 0: 272 with nogil: 273 res = create_delta_index(src, self._index, &index, 274 self._max_bytes_to_index) 275 if res != DELTA_OK: 276 raise _translate_delta_failure(res) 277 if index != self._index: 278 free_delta_index(self._index) 279 self._index = index 280 281 cdef _populate_first_index(self): 282 cdef delta_index *index 283 cdef delta_result res 284 if len(self._sources) != 1 or self._index != NULL: 285 raise AssertionError('_populate_first_index should only be' 286 ' called when we have a single source and no index yet') 287 288 # We know that self._index is already NULL, so create_delta_index 289 # will always create a new index unless there's a malloc failure 290 with nogil: 291 res = create_delta_index(&self._source_infos[0], NULL, &index, 292 self._max_bytes_to_index) 293 if res != DELTA_OK: 294 raise _translate_delta_failure(res) 295 self._index = index 296 297 cdef _expand_sources(self): 298 raise RuntimeError('if we move self._source_infos, then we need to' 299 ' change all of the index pointers as well.') 300 301 def make_delta(self, target_bytes, max_delta_size=0): 302 """Create a delta from the current source to the target bytes.""" 303 cdef char *target 304 cdef Py_ssize_t target_size 305 cdef void * delta 306 cdef unsigned long delta_size 307 cdef unsigned long c_max_delta_size 308 cdef delta_result res 309 310 if self._index == NULL: 311 if len(self._sources) == 0: 312 return None 313 # We were just lazy about generating the index 314 self._populate_first_index() 315 316 if not PyBytes_CheckExact(target_bytes): 317 raise TypeError('target is not a bytestring') 318 319 target = PyBytes_AS_STRING(target_bytes) 320 target_size = PyBytes_GET_SIZE(target_bytes) 321 322 # TODO: inline some of create_delta so we at least don't have to double 323 # malloc, and can instead use PyBytes_FromStringAndSize, to 324 # allocate the bytes into the final string 325 c_max_delta_size = max_delta_size 326 with nogil: 327 res = create_delta(self._index, target, target_size, 328 &delta_size, c_max_delta_size, &delta) 329 result = None 330 if res == DELTA_OK: 331 result = PyBytes_FromStringAndSize(<char *>delta, delta_size) 332 free(delta) 333 elif res != DELTA_SIZE_TOO_BIG: 334 raise _translate_delta_failure(res) 335 return result 336 337 338def make_delta(source_bytes, target_bytes): 339 """Create a delta, this is a wrapper around DeltaIndex.make_delta.""" 340 di = DeltaIndex(source_bytes) 341 return di.make_delta(target_bytes) 342 343 344def apply_delta(source_bytes, delta_bytes): 345 """Apply a delta generated by make_delta to source_bytes.""" 346 cdef char *source 347 cdef Py_ssize_t source_size 348 cdef char *delta 349 cdef Py_ssize_t delta_size 350 351 if not PyBytes_CheckExact(source_bytes): 352 raise TypeError('source is not a bytestring') 353 if not PyBytes_CheckExact(delta_bytes): 354 raise TypeError('delta is not a bytestring') 355 source = PyBytes_AS_STRING(source_bytes) 356 source_size = PyBytes_GET_SIZE(source_bytes) 357 delta = PyBytes_AS_STRING(delta_bytes) 358 delta_size = PyBytes_GET_SIZE(delta_bytes) 359 # Code taken from patch-delta.c, only brought here to give better error 360 # handling, and to avoid double allocating memory 361 if (delta_size < DELTA_SIZE_MIN): 362 # XXX: Invalid delta block 363 raise RuntimeError('delta_size %d smaller than min delta size %d' 364 % (delta_size, DELTA_SIZE_MIN)) 365 366 return _apply_delta(source, source_size, delta, delta_size) 367 368 369cdef unsigned char *_decode_copy_instruction(unsigned char *data, 370 unsigned char cmd, unsigned int *offset, 371 unsigned int *length) nogil: # cannot_raise 372 """Decode a copy instruction from the next few bytes. 373 374 A copy instruction is a variable number of bytes, so we will parse the 375 bytes we care about, and return the new position, as well as the offset and 376 length referred to in the bytes. 377 378 :param bytes: Pointer to the start of bytes after cmd 379 :param cmd: The command code 380 :return: Pointer to the bytes just after the last decode byte 381 """ 382 cdef unsigned int off, size, count 383 off = 0 384 size = 0 385 count = 0 386 if (cmd & 0x01): 387 off = data[count] 388 count = count + 1 389 if (cmd & 0x02): 390 off = off | (data[count] << 8) 391 count = count + 1 392 if (cmd & 0x04): 393 off = off | (data[count] << 16) 394 count = count + 1 395 if (cmd & 0x08): 396 off = off | (data[count] << 24) 397 count = count + 1 398 if (cmd & 0x10): 399 size = data[count] 400 count = count + 1 401 if (cmd & 0x20): 402 size = size | (data[count] << 8) 403 count = count + 1 404 if (cmd & 0x40): 405 size = size | (data[count] << 16) 406 count = count + 1 407 if (size == 0): 408 size = 0x10000 409 offset[0] = off 410 length[0] = size 411 return data + count 412 413 414cdef object _apply_delta(char *source, Py_ssize_t source_size, 415 char *delta, Py_ssize_t delta_size): 416 """common functionality between apply_delta and apply_delta_to_source.""" 417 cdef unsigned char *data 418 cdef unsigned char *top 419 cdef unsigned char *dst_buf 420 cdef unsigned char *out 421 cdef unsigned char cmd 422 cdef Py_ssize_t size 423 cdef unsigned int cp_off, cp_size 424 cdef int failed 425 426 data = <unsigned char *>delta 427 top = data + delta_size 428 429 # now the result size 430 size = get_delta_hdr_size(&data, top) 431 result = PyBytes_FromStringAndSize(NULL, size) 432 dst_buf = <unsigned char*>PyBytes_AS_STRING(result) 433 434 failed = 0 435 with nogil: 436 out = dst_buf 437 while (data < top): 438 cmd = data[0] 439 data = data + 1 440 if (cmd & 0x80): 441 # Copy instruction 442 data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size) 443 if (cp_off + cp_size < cp_size or 444 cp_off + cp_size > <unsigned int>source_size or 445 cp_size > <unsigned int>size): 446 failed = 1 447 break 448 memcpy(out, source + cp_off, cp_size) 449 out = out + cp_size 450 size = size - cp_size 451 else: 452 # Insert instruction 453 if cmd == 0: 454 # cmd == 0 is reserved for future encoding 455 # extensions. In the mean time we must fail when 456 # encountering them (might be data corruption). 457 failed = 2 458 break 459 if cmd > size: 460 failed = 3 461 break 462 memcpy(out, data, cmd) 463 out = out + cmd 464 data = data + cmd 465 size = size - cmd 466 if failed: 467 if failed == 1: 468 raise ValueError('Something wrong with:' 469 ' cp_off = %s, cp_size = %s' 470 ' source_size = %s, size = %s' 471 % (cp_off, cp_size, source_size, size)) 472 elif failed == 2: 473 raise ValueError('Got delta opcode: 0, not supported') 474 elif failed == 3: 475 raise ValueError('Insert instruction longer than remaining' 476 ' bytes: %d > %d' % (cmd, size)) 477 478 # sanity check 479 if (data != top or size != 0): 480 raise RuntimeError('Did not extract the number of bytes we expected' 481 ' we were left with %d bytes in "size", and top - data = %d' 482 % (size, <int>(top - data))) 483 484 # *dst_size = out - dst_buf; 485 if (out - dst_buf) != PyBytes_GET_SIZE(result): 486 raise RuntimeError('Number of bytes extracted did not match the' 487 ' size encoded in the delta header.') 488 return result 489 490 491def apply_delta_to_source(source, delta_start, delta_end): 492 """Extract a delta from source bytes, and apply it.""" 493 cdef char *c_source 494 cdef Py_ssize_t c_source_size 495 cdef char *c_delta 496 cdef Py_ssize_t c_delta_size 497 cdef Py_ssize_t c_delta_start, c_delta_end 498 499 if not PyBytes_CheckExact(source): 500 raise TypeError('source is not a str') 501 c_source_size = PyBytes_GET_SIZE(source) 502 c_delta_start = delta_start 503 c_delta_end = delta_end 504 if c_delta_start >= c_source_size: 505 raise ValueError('delta starts after source') 506 if c_delta_end > c_source_size: 507 raise ValueError('delta ends after source') 508 if c_delta_start >= c_delta_end: 509 raise ValueError('delta starts after it ends') 510 511 c_delta_size = c_delta_end - c_delta_start 512 c_source = PyBytes_AS_STRING(source) 513 c_delta = c_source + c_delta_start 514 # We don't use source_size, because we know the delta should not refer to 515 # any bytes after it starts 516 return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size) 517 518 519def encode_base128_int(val): 520 """Convert an integer into a 7-bit lsb encoding.""" 521 cdef unsigned int c_val 522 cdef Py_ssize_t count 523 cdef unsigned int num_bytes 524 cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes 525 526 c_val = val 527 count = 0 528 while c_val >= 0x80 and count < 8: 529 c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF) 530 c_val = c_val >> 7 531 count = count + 1 532 if count >= 8 or c_val >= 0x80: 533 raise ValueError('encode_base128_int overflowed the buffer') 534 c_bytes[count] = <unsigned char>(c_val & 0xFF) 535 count = count + 1 536 return PyBytes_FromStringAndSize(<char *>c_bytes, count) 537 538 539def decode_base128_int(data): 540 """Decode an integer from a 7-bit lsb encoding.""" 541 cdef int offset 542 cdef int val 543 cdef unsigned int uval 544 cdef int shift 545 cdef Py_ssize_t num_low_bytes 546 cdef unsigned char *c_bytes 547 548 offset = 0 549 val = 0 550 shift = 0 551 if not PyBytes_CheckExact(data): 552 raise TypeError('bytes is not a string') 553 c_bytes = <unsigned char*>PyBytes_AS_STRING(data) 554 # We take off 1, because we have to be able to decode the non-expanded byte 555 num_low_bytes = PyBytes_GET_SIZE(data) - 1 556 while (c_bytes[offset] & 0x80) and offset < num_low_bytes: 557 val = val | ((c_bytes[offset] & 0x7F) << shift) 558 shift = shift + 7 559 offset = offset + 1 560 if c_bytes[offset] & 0x80: 561 raise ValueError('Data not properly formatted, we ran out of' 562 ' bytes before 0x80 stopped being set.') 563 val = val | (c_bytes[offset] << shift) 564 offset = offset + 1 565 if val < 0: 566 uval = <unsigned int> val 567 return uval, offset 568 return val, offset 569 570 571