#!/usr/bin/python -u
#
# Python Bindings for LZMA
#
# Copyright (c) 2004-2015 by Joachim Bauch, mail@joachim-bauch.de
# 7-Zip Copyright (C) 1999-2010 Igor Pavlov
# LZMA SDK Copyright (C) 1999-2010 Igor Pavlov
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# $Id$
#
"""Read 7zip format archives."""

from array import array
from binascii import unhexlify
from datetime import datetime
import pylzma
from struct import pack, unpack
from zlib import crc32
import zlib
import bz2
import os
import sys
try:
    from io import BytesIO
except ImportError:
    from cStringIO import StringIO as BytesIO
try:
    from functools import reduce
except ImportError:
    # reduce is available in functools starting with Python 2.6
    pass

try:
    from pytz import UTC
except ImportError:
    # pytz is optional, define own "UTC" timestamp
    # reference implementation from Python documentation
    from datetime import timedelta, tzinfo

    ZERO = timedelta(0)

    class UTC(tzinfo):
        """UTC"""

        def utcoffset(self, dt):
            return ZERO

        def tzname(self, dt):
            return "UTC"

        def dst(self, dt):
            return ZERO

        def __call__(self):
            # Calling the singleton instance returns the instance itself.
            return self

        # Historical (misspelled) name of __call__, kept as an alias.
        _call__ = __call__

    UTC = UTC()

try:
    unicode
except NameError:
    # Python 3.x
    def unicode(s, encoding):
        return s
else:
    # Python 2.x: "bytes(s, encoding)" simply returns the (byte) string
    def bytes(s, encoding):
        return s

try:
    long
except NameError:
    # Python 3.x
    long = int

try:
    xrange
except NameError:
    # Python 3.x
    xrange = range

IS_PYTHON3 = sys.version_info[0] == 3

NEED_BYTESWAP = sys.byteorder != 'little'

if array('L').itemsize == 4:
    ARRAY_TYPE_UINT32 = 'L'
else:
    assert array('I').itemsize == 4
    ARRAY_TYPE_UINT32 = 'I'

READ_BLOCKSIZE = 16384

MAGIC_7Z = unhexlify('377abcaf271c')  # '7z\xbc\xaf\x27\x1c'

PROPERTY_END = unhexlify('00')  # '\x00'
PROPERTY_HEADER = unhexlify('01')  # '\x01'
PROPERTY_ARCHIVE_PROPERTIES = unhexlify('02')  # '\x02'
PROPERTY_ADDITIONAL_STREAMS_INFO = unhexlify('03')  # '\x03'
PROPERTY_MAIN_STREAMS_INFO = unhexlify('04')  # '\x04'
PROPERTY_FILES_INFO = unhexlify('05')  # '\x05'
PROPERTY_PACK_INFO = unhexlify('06')  # '\x06'
PROPERTY_UNPACK_INFO = unhexlify('07')  # '\x07'
PROPERTY_SUBSTREAMS_INFO = unhexlify('08')  # '\x08'
PROPERTY_SIZE = unhexlify('09')  # '\x09'
PROPERTY_CRC = unhexlify('0a')  # '\x0a'
PROPERTY_FOLDER = unhexlify('0b')  # '\x0b'
PROPERTY_CODERS_UNPACK_SIZE = unhexlify('0c')  # '\x0c'
PROPERTY_NUM_UNPACK_STREAM = unhexlify('0d')  # '\x0d'
PROPERTY_EMPTY_STREAM = unhexlify('0e')  # '\x0e'
PROPERTY_EMPTY_FILE = unhexlify('0f')  # '\x0f'
PROPERTY_ANTI = unhexlify('10')  # '\x10'
PROPERTY_NAME = unhexlify('11')  # '\x11'
PROPERTY_CREATION_TIME = unhexlify('12')  # '\x12'
PROPERTY_LAST_ACCESS_TIME = unhexlify('13')  # '\x13'
PROPERTY_LAST_WRITE_TIME = unhexlify('14')  # '\x14'
PROPERTY_ATTRIBUTES = unhexlify('15')  # '\x15'
PROPERTY_COMMENT = unhexlify('16')  # '\x16'
PROPERTY_ENCODED_HEADER = unhexlify('17')  # '\x17'
PROPERTY_START_POS = unhexlify('18')  # '\x18'
PROPERTY_DUMMY = unhexlify('19')  # '\x19'

COMPRESSION_METHOD_COPY = unhexlify('00')  # '\x00'
COMPRESSION_METHOD_LZMA = unhexlify('03')  # '\x03'
COMPRESSION_METHOD_CRYPTO = unhexlify('06')  # '\x06'
COMPRESSION_METHOD_MISC = unhexlify('04')  # '\x04'
COMPRESSION_METHOD_MISC_ZIP = unhexlify('0401')  # '\x04\x01'
COMPRESSION_METHOD_MISC_BZIP = unhexlify('0402')  # '\x04\x02'
COMPRESSION_METHOD_7Z_AES256_SHA256 = unhexlify('06f10701')  # '\x06\xf1\x07\x01'
COMPRESSION_METHOD_LZMA2 = unhexlify('21')  # '\x21'

FILE_ATTRIBUTE_DIRECTORY = 0x10
FILE_ATTRIBUTE_READONLY = 0x01
FILE_ATTRIBUTE_HIDDEN = 0x02
FILE_ATTRIBUTE_SYSTEM = 0x04
FILE_ATTRIBUTE_ARCHIVE = 0x20

# number of seconds between 1601/01/01 and 1970/01/01 (UTC)
# used to adjust 7z FILETIME to Python timestamp
TIMESTAMP_ADJUST = -11644473600


def toTimestamp(filetime):
    """Convert 7z FILETIME to Python timestamp."""
    # FILETIME is 100-nanosecond intervals since 1601/01/01 (UTC)
    return (filetime / 10000000.0) + TIMESTAMP_ADJUST


def calculate_crc32(data, value=None, blocksize=1024*1024):
    """Calculate CRC32 of strings with arbitrary lengths.

    The data is fed to zlib's crc32 in chunks of "blocksize" bytes.
    "value" is the running CRC of previously processed data (if any).
    The result is always returned as an unsigned 32-bit number.
    """
    length = len(data)
    pos = blocksize
    if value:
        value = crc32(data[:pos], value)
    else:
        value = crc32(data[:pos])
    while pos < length:
        value = crc32(data[pos:pos+blocksize], value)
        pos += blocksize

    return value & 0xffffffff


class ArchiveError(Exception):
    """Base class of all errors raised by this module."""
    pass


class FormatError(ArchiveError):
    """The archive data does not have the expected structure."""
    pass


class EncryptedArchiveError(ArchiveError):
    pass


class UnsupportedCompressionMethodError(ArchiveError):
    """No decoder is registered for a coder's method id."""
    pass


class DecryptionError(ArchiveError):
    pass


class NoPasswordGivenError(DecryptionError):
    """Encrypted data was found but no password was given."""
    pass


class WrongPasswordError(DecryptionError):
    """Decompressing encrypted data failed, presumably due to a bad password."""
    pass


class DecompressionError(ArchiveError):
    """The compressed stream ended before all expected data was produced."""
    pass


class ArchiveTimestamp(long):
    """Windows FILETIME timestamp."""

    def __repr__(self):
        return '%s(%d)' % (type(self).__name__, self)

    def as_datetime(self):
        """Convert FILETIME to Python datetime object."""
        return datetime.fromtimestamp(toTimestamp(self), UTC)


class Base(object):
    """ base class with support for various basic read/write functions """

    def _readReal64Bit(self, file):
        """Read a fixed-size little-endian 64 bit number.

        Returns a tuple (value, raw bytes read); callers use the raw
        bytes to calculate checksums.
        """
        res = file.read(8)
        a, b = unpack('<LL', res)
        return b << 32 | a, res

    def _read64Bit(self, file):
        """Read a variable-length encoded number (up to 64 bits).

        The number of leading 1-bits in the first byte is the number of
        additional bytes (little-endian) that follow; the remaining low
        bits of the first byte are the most significant part.
        """
        b = ord(file.read(1))
        mask = 0x80
        for i in xrange(8):
            if b & mask == 0:
                extra = array('B', file.read(i))
                extra.reverse()
                value = (extra and reduce(lambda x, y: x << 8 | y, extra)) or 0
                highpart = b & (mask - 1)
                return value + (highpart << (i * 8))

            mask >>= 1

        # First byte is 0xff: the value is stored in the following 8 bytes.
        # (The loop above used to fall through here and return None.)
        return unpack('<Q', file.read(8))[0]

    def _readBoolean(self, file, count, checkall=0):
        """Read a bit vector with "count" entries as list of booleans.

        If "checkall" is set, a leading byte is read first; any value
        other than 0x00 means "all entries are True" and no bit vector
        follows.  Bits are stored most significant bit first.
        """
        if checkall:
            alldefined = file.read(1)
            if alldefined != unhexlify('00'):
                return [True] * count

        result = []
        b = 0
        mask = 0
        for i in xrange(count):
            if mask == 0:
                b = ord(file.read(1))
                mask = 0x80
            result.append(b & mask != 0)
            mask >>= 1

        return result

    def checkcrc(self, crc, data):
        """Return True if the CRC32 of "data" equals "crc"."""
        check = calculate_crc32(data)
        return crc == check


class PackInfo(Base):
    """ informations about packed streams """

    def __init__(self, file):
        """Parse the pack info from "file".

        Sets "packpos" and "numstreams"; "packsizes" and "crcs" are only
        set if the corresponding property is present in the data.
        """
        self.packpos = self._read64Bit(file)
        self.numstreams = self._read64Bit(file)
        id = file.read(1)
        if id == PROPERTY_SIZE:
            self.packsizes = [self._read64Bit(file) for x in xrange(self.numstreams)]
            id = file.read(1)

            if id == PROPERTY_CRC:
                self.crcs = [self._read64Bit(file) for x in xrange(self.numstreams)]
                id = file.read(1)

        if id != PROPERTY_END:
            raise FormatError('end id expected but %s found' % repr(id))


class Folder(Base):
    """ a "Folder" represents a stream of compressed data """

    solid = False

    def __init__(self, file):
        """Parse coders, bind pairs and packed stream indexes from "file"."""
        numcoders = self._read64Bit(file)
        self.coders = []
        self.digestdefined = False
        totalin = 0
        self.totalout = 0
        for i in xrange(numcoders):
            # a coder may be followed by alternative coders, the flag
            # byte of the last alternative has bit 0x80 cleared
            while True:
                b = ord(file.read(1))
                methodsize = b & 0xf
                issimple = b & 0x10 == 0
                noattributes = b & 0x20 == 0
                last_alternative = b & 0x80 == 0
                c = {}
                c['method'] = file.read(methodsize)
                if not issimple:
                    c['numinstreams'] = self._read64Bit(file)
                    c['numoutstreams'] = self._read64Bit(file)
                else:
                    c['numinstreams'] = 1
                    c['numoutstreams'] = 1
                totalin += c['numinstreams']
                self.totalout += c['numoutstreams']
                if not noattributes:
                    c['properties'] = file.read(self._read64Bit(file))
                self.coders.append(c)
                if last_alternative:
                    break

        numbindpairs = self.totalout - 1
        self.bindpairs = []
        for i in xrange(numbindpairs):
            # each pair is stored as (in index, out index)
            self.bindpairs.append((self._read64Bit(file), self._read64Bit(file), ))

        numpackedstreams = totalin - numbindpairs
        self.packed_indexes = []
        if numpackedstreams == 1:
            # the single packed stream is the in stream that is not bound
            for i in xrange(totalin):
                if self.findInBindPair(i) < 0:
                    self.packed_indexes.append(i)
        elif numpackedstreams > 1:
            for i in xrange(numpackedstreams):
                self.packed_indexes.append(self._read64Bit(file))

    def getUnpackSize(self):
        """Return the size of the final (unbound) output stream.

        Returns 0 if no unpack sizes are known and raises TypeError if
        every output stream is bound to an input stream.
        """
        if not self.unpacksizes:
            return 0

        for i in xrange(len(self.unpacksizes)-1, -1, -1):
            # findOutBindPair returns -1 for "not bound"; comparing the
            # result for truthiness would treat index 0 as "not found"
            # and -1 (and any other index) as "found".
            if self.findOutBindPair(i) < 0:
                return self.unpacksizes[i]

        raise TypeError('not found')

    def findInBindPair(self, index):
        """Return position of the bind pair with in index "index" or -1."""
        for idx, (a, b) in enumerate(self.bindpairs):
            if a == index:
                return idx
        return -1

    def findOutBindPair(self, index):
        """Return position of the bind pair with out index "index" or -1."""
        for idx, (a, b) in enumerate(self.bindpairs):
            if b == index:
                return idx
        return -1

    def isEncrypted(self):
        """Return True if one of the coders is the 7z AES256/SHA256 coder."""
        return COMPRESSION_METHOD_7Z_AES256_SHA256 in [x['method'] for x in self.coders]


class Digests(Base):
    """ holds a list of checksums """

    def __init__(self, file, count):
        """Read the "defined" flags and "count" 32 bit checksums."""
        self.defined = self._readBoolean(file, count, checkall=1)
        self.crcs = array(ARRAY_TYPE_UINT32, file.read(4*count))
        if NEED_BYTESWAP:
            # checksums are stored little-endian
            self.crcs.byteswap()


UnpackDigests = Digests


class UnpackInfo(Base):
    """ combines multiple folders """

    def __init__(self, file):
        """Parse folders, their unpack sizes and optional digests."""
        id = file.read(1)
        if id != PROPERTY_FOLDER:
            raise FormatError('folder id expected but %s found' % repr(id))
        self.numfolders = self._read64Bit(file)
        self.folders = []
        external = file.read(1)
        if external == unhexlify('00'):
            self.folders = [Folder(file) for x in xrange(self.numfolders)]
        elif external == unhexlify('01'):
            # folders are stored externally, only the index is remembered
            self.datastreamidx = self._read64Bit(file)
        else:
            raise FormatError('0x00 or 0x01 expected but %s found' % repr(external))

        id = file.read(1)
        if id != PROPERTY_CODERS_UNPACK_SIZE:
            raise FormatError('coders unpack size id expected but %s found' % repr(id))

        for folder in self.folders:
            folder.unpacksizes = [self._read64Bit(file) for x in xrange(folder.totalout)]

        id = file.read(1)
        if id == PROPERTY_CRC:
            digests = UnpackDigests(file, self.numfolders)
            for idx, folder in enumerate(self.folders):
                folder.digestdefined = digests.defined[idx]
                folder.crc = digests.crcs[idx]

            id = file.read(1)

        if id != PROPERTY_END:
            raise FormatError('end id expected but %s found' % repr(id))


class SubstreamsInfo(Base):
    """ defines the substreams of a folder """

    def __init__(self, file, numfolders, folders):
        """Parse number of streams, sizes and digests for all folders.

        Sets "numunpackstreams", "digests" and "digestsdefined";
        "unpacksizes" is only set if the size property is present.
        """
        self.digests = []
        self.digestsdefined = []
        id = file.read(1)
        if id == PROPERTY_NUM_UNPACK_STREAM:
            self.numunpackstreams = [self._read64Bit(file) for x in xrange(numfolders)]
            id = file.read(1)
        else:
            self.numunpackstreams = [1]*numfolders

        if id == PROPERTY_SIZE:
            self.unpacksizes = []
            for i in xrange(len(self.numunpackstreams)):
                sum = 0
                # only the sizes of the first n-1 streams are stored, the
                # last one is the remainder of the folder's unpack size
                for j in xrange(1, self.numunpackstreams[i]):
                    size = self._read64Bit(file)
                    self.unpacksizes.append(size)
                    sum += size
                self.unpacksizes.append(folders[i].getUnpackSize() - sum)

            id = file.read(1)

        numdigests = 0
        numdigeststotal = 0
        for i in xrange(numfolders):
            numsubstreams = self.numunpackstreams[i]
            if numsubstreams != 1 or not folders[i].digestdefined:
                numdigests += numsubstreams
            numdigeststotal += numsubstreams

        if id == PROPERTY_CRC:
            digests = Digests(file, numdigests)
            didx = 0
            for i in xrange(numfolders):
                folder = folders[i]
                numsubstreams = self.numunpackstreams[i]
                if numsubstreams == 1 and folder.digestdefined:
                    # single stream: the folder checksum is the checksum
                    # of the stream
                    self.digestsdefined.append(True)
                    self.digests.append(folder.crc)
                else:
                    for j in xrange(numsubstreams):
                        self.digestsdefined.append(digests.defined[didx])
                        self.digests.append(digests.crcs[didx])
                        didx += 1

            id = file.read(1)

        if id != PROPERTY_END:
            raise FormatError('end id expected but %r found' % id)

        if not self.digestsdefined:
            self.digestsdefined = [False] * numdigeststotal
            self.digests = [0] * numdigeststotal


class StreamsInfo(Base):
    """ informations about compressed streams """

    def __init__(self, file):
        """Parse the optional pack, unpack and substreams info parts."""
        id = file.read(1)
        if id == PROPERTY_PACK_INFO:
            self.packinfo = PackInfo(file)
            id = file.read(1)

        if id == PROPERTY_UNPACK_INFO:
            self.unpackinfo = UnpackInfo(file)
            id = file.read(1)

        if id == PROPERTY_SUBSTREAMS_INFO:
            self.substreamsinfo = SubstreamsInfo(file, self.unpackinfo.numfolders, self.unpackinfo.folders)
            id = file.read(1)

        if id != PROPERTY_END:
            raise FormatError('end id expected but %s found' % repr(id))


class FilesInfo(Base):
    """ holds file properties """

    def _readTimes(self, file, files, name):
        """Read one timestamp per file and store it under key "name".

        Files without a defined timestamp get None.
        """
        defined = self._readBoolean(file, len(files), checkall=1)

        # NOTE: the "external" flag is currently ignored, should be 0x00
        external = file.read(1)
        for i in xrange(len(files)):
            if defined[i]:
                files[i][name] = ArchiveTimestamp(self._readReal64Bit(file)[0])
            else:
                files[i][name] = None

    def __init__(self, file):
        """Parse the properties of all files into the list "files".

        Each entry of "files" is a dictionary with the properties found
        ("emptystream", "filename", timestamps, "attributes").
        """
        self.numfiles = self._read64Bit(file)
        self.files = [{'emptystream': False} for x in xrange(self.numfiles)]
        numemptystreams = 0
        while True:
            typ = self._read64Bit(file)
            if typ > 255:
                raise FormatError('invalid type, must be below 256, is %d' % typ)

            typ = pack('B', typ)
            if typ == PROPERTY_END:
                break

            size = self._read64Bit(file)
            if typ == PROPERTY_DUMMY:
                # Added by newer versions of 7z to adjust padding.
                file.seek(size, os.SEEK_CUR)
                continue

            buffer = BytesIO(file.read(size))
            if typ == PROPERTY_EMPTY_STREAM:
                isempty = self._readBoolean(buffer, self.numfiles)
                for f, empty in zip(self.files, isempty):
                    f['emptystream'] = empty
                    if empty:
                        numemptystreams += 1
                # parsed for completeness, currently not evaluated
                emptyfiles = [False] * numemptystreams
                antifiles = [False] * numemptystreams
            elif typ == PROPERTY_EMPTY_FILE:
                emptyfiles = self._readBoolean(buffer, numemptystreams)
            elif typ == PROPERTY_ANTI:
                antifiles = self._readBoolean(buffer, numemptystreams)
            elif typ == PROPERTY_NAME:
                external = buffer.read(1)
                if external != unhexlify('00'):
                    self.dataindex = self._read64Bit(buffer)
                    # XXX: evaluate external
                    raise NotImplementedError

                terminator = unhexlify('0000')
                for f in self.files:
                    # names are stored as 0x0000 terminated UTF-16-LE;
                    # collect the raw code units and decode them at once
                    # so surrogate pairs are handled correctly
                    units = []
                    while True:
                        ch = buffer.read(2)
                        if ch == terminator:
                            break
                        if len(ch) < 2:
                            # truncated data would loop forever otherwise
                            raise FormatError('unexpected end of data while reading filename')
                        units.append(ch)
                    f['filename'] = bytes('', 'ascii').join(units).decode('utf-16-le')
            elif typ == PROPERTY_CREATION_TIME:
                self._readTimes(buffer, self.files, 'creationtime')
            elif typ == PROPERTY_LAST_ACCESS_TIME:
                self._readTimes(buffer, self.files, 'lastaccesstime')
            elif typ == PROPERTY_LAST_WRITE_TIME:
                self._readTimes(buffer, self.files, 'lastwritetime')
            elif typ == PROPERTY_ATTRIBUTES:
                defined = self._readBoolean(buffer, self.numfiles, checkall=1)
                external = buffer.read(1)
                if external != unhexlify('00'):
                    self.dataindex = self._read64Bit(buffer)
                    # XXX: evaluate external
                    raise NotImplementedError

                for idx, f in enumerate(self.files):
                    if defined[idx]:
                        f['attributes'] = unpack('<L', buffer.read(4))[0]
                    else:
                        f['attributes'] = None
            else:
                raise FormatError('invalid type %r' % (typ))


class Header(Base):
    """ the archive header """

    def __init__(self, file):
        """Parse the optional header parts from "file".

        Sets "properties", "additional_streams", "main_streams" and
        "files" for the parts that are present.
        """
        id = file.read(1)
        if id == PROPERTY_ARCHIVE_PROPERTIES:
            # NOTE(review): "ArchiveProperties" is not defined in the
            # code visible here -- confirm it exists elsewhere.
            self.properties = ArchiveProperties(file)
            id = file.read(1)

        if id == PROPERTY_ADDITIONAL_STREAMS_INFO:
            self.additional_streams = StreamsInfo(file)
            id = file.read(1)

        if id == PROPERTY_MAIN_STREAMS_INFO:
            self.main_streams = StreamsInfo(file)
            id = file.read(1)

        if id == PROPERTY_FILES_INFO:
            self.files = FilesInfo(file)
            id = file.read(1)

        if id != PROPERTY_END:
            raise FormatError('end id expected but %s found' % (repr(id)))


class ArchiveFile(Base):
    """ wrapper around a file in the archive """

    def __init__(self, info, start, src_start, folder, archive, maxsize=None):
        """Create wrapper for a single file.

        "info" is a dictionary whose items become attributes of the
        object; it must contain "_uncompressed" (list of sizes per coder
        level).  "start" is the offset of the file in the decompressed
        folder data, "src_start" the position of the compressed data in
        the archive file.
        """
        self.digest = None
        self._archive = archive
        self._file = archive._file
        self._start = start
        self._src_start = src_start
        self._folder = folder
        # maxsize is only valid for solid archives
        self._maxsize = maxsize
        for k, v in info.items():
            setattr(self, k, v)
        self.size = self.uncompressed = self._uncompressed[-1]
        if not hasattr(self, 'filename'):
            # compressed file is stored without a name, generate one
            try:
                basefilename = self._file.name
            except AttributeError:
                # 7z archive file doesn't have a name
                self.filename = 'contents'
            else:
                self.filename = os.path.splitext(os.path.basename(basefilename))[0]
        self.reset()
        # maps method ids to the names of the methods decoding them
        self._decoders = {
            COMPRESSION_METHOD_COPY: '_read_copy',
            COMPRESSION_METHOD_LZMA: '_read_lzma',
            COMPRESSION_METHOD_LZMA2: '_read_lzma2',
            COMPRESSION_METHOD_MISC_ZIP: '_read_zip',
            COMPRESSION_METHOD_MISC_BZIP: '_read_bzip',
            COMPRESSION_METHOD_7Z_AES256_SHA256: '_read_7z_aes256_sha256',
        }

    def _is_encrypted(self):
        return self._folder.isEncrypted()

    def reset(self):
        self.pos = 0

    def read(self):
        """Return the decoded contents of the file.

        The coders of the folder are applied in order, each one receives
        the output of the previous one.  Raises
        UnsupportedCompressionMethodError if no decoder matches a coder.
        """
        if not self.size:
            # NOTE(review): this is a text string on Python 3 while all
            # other paths return bytes -- kept for compatibility.
            return ''
        elif not self._folder.coders:
            raise TypeError("file has no coder informations")

        data = None
        num_coders = len(self._folder.coders)
        for level, coder in enumerate(self._folder.coders):
            method = coder['method']
            decoder = None
            # find the decoder with the longest matching method id prefix
            while method and decoder is None:
                decoder = self._decoders.get(method, None)
                method = method[:-1]

            if decoder is None:
                raise UnsupportedCompressionMethodError(repr(coder['method']))

            data = getattr(self, decoder)(coder, data, level, num_coders)

        return data

    def _read_copy(self, coder, input, level, num_coders):
        """Decoder for uncompressed ("copy") data."""
        size = self._uncompressed[level]
        if not input:
            self._file.seek(self._src_start)
            input = self._file.read(size)
        return input[self._start:self._start+size]

    def _read_from_decompressor(self, coder, decompressor, input, level, num_coders, can_partial_decompress=True, with_cache=False):
        """Run "decompressor" on "input" or on data read from the archive.

        "can_partial_decompress" tells if the decompressor accepts a
        maximum output length as second argument of "decompress".  With
        "with_cache" set, the state for solid folders is stored on the
        folder so the next file doesn't have to start from the beginning.
        """
        size = self._uncompressed[level]
        data = ''
        properties = coder.get('properties', None)
        if properties:
            decompressor.decompress(properties)
        total = self.compressed
        is_last_coder = (level + 1) == num_coders
        if not input and is_last_coder:
            # read blockwise from the archive until enough data is available
            remaining = self._start+size
            out = BytesIO()
            cache = getattr(self._folder, '_decompress_cache', None)
            if cache is not None:
                # continue where the previous file of the folder stopped
                data, pos, decompressor = cache
                out.write(data)
                remaining -= len(data)
                self._file.seek(pos)
            else:
                self._file.seek(self._src_start)
            checkremaining = is_last_coder and not self._folder.solid and can_partial_decompress
            while remaining > 0:
                data = self._file.read(READ_BLOCKSIZE)
                if checkremaining or (with_cache and len(data) < READ_BLOCKSIZE):
                    tmp = decompressor.decompress(data, remaining)
                else:
                    tmp = decompressor.decompress(data)
                if not tmp and not data:
                    raise DecompressionError('end of stream while decompressing')
                out.write(tmp)
                remaining -= len(tmp)

            data = out.getvalue()
            if with_cache and self._folder.solid:
                # don't decompress start of solid archive for next file
                # TODO: limit size of cached data
                self._folder._decompress_cache = (data, self._file.tell(), decompressor)
        else:
            if not input:
                self._file.seek(self._src_start)
                input = self._file.read(total)
            if is_last_coder and can_partial_decompress:
                data = decompressor.decompress(input, self._start+size)
            else:
                data = decompressor.decompress(input)
                if can_partial_decompress and not is_last_coder:
                    # intermediate result, passed to next coder unchanged
                    return data

        return data[self._start:self._start+size]

    def _read_lzma(self, coder, input, level, num_coders):
        """Decoder for LZMA compressed data."""
        size = self._uncompressed[level]
        is_last_coder = (level + 1) == num_coders
        if is_last_coder and not self._folder.solid:
            dec = pylzma.decompressobj(maxlength=self._start+size)
        else:
            dec = pylzma.decompressobj()
        try:
            return self._read_from_decompressor(coder, dec, input, level, num_coders, with_cache=True)
        except ValueError:
            if self._is_encrypted():
                raise WrongPasswordError('invalid password')

            raise

    def _read_lzma2(self, coder, input, level, num_coders):
        """Decoder for LZMA2 compressed data."""
        size = self._uncompressed[level]
        is_last_coder = (level + 1) == num_coders
        if is_last_coder and not self._folder.solid:
            dec = pylzma.decompressobj(maxlength=self._start+size, lzma2=True)
        else:
            dec = pylzma.decompressobj(lzma2=True)
        try:
            return self._read_from_decompressor(coder, dec, input, level, num_coders, with_cache=True)
        except ValueError:
            if self._is_encrypted():
                raise WrongPasswordError('invalid password')

            raise

    def _read_zip(self, coder, input, level, num_coders):
        """Decoder for raw deflate compressed data."""
        dec = zlib.decompressobj(-15)
        return self._read_from_decompressor(coder, dec, input, level, num_coders)

    def _read_bzip(self, coder, input, level, num_coders):
        """Decoder for bzip2 compressed data."""
        dec = bz2.BZ2Decompressor()
        return self._read_from_decompressor(coder, dec, input, level, num_coders, can_partial_decompress=False)

    def _read_7z_aes256_sha256(self, coder, input, level, num_coders):
        """Decoder for AES256 encrypted data.

        Raises NoPasswordGivenError if the archive has no password.
        """
        if not self._archive.password:
            raise NoPasswordGivenError()

        # TODO: this needs some sanity checks
        firstbyte = coder['properties'][0]
        if not IS_PYTHON3:
            firstbyte = ord(firstbyte)
        numcyclespower = firstbyte & 0x3f
        if firstbyte & 0xc0 != 0:
            saltsize = (firstbyte >> 7) & 1
            ivsize = (firstbyte >> 6) & 1

            secondbyte = coder['properties'][1]
            if not IS_PYTHON3:
                secondbyte = ord(secondbyte)
            saltsize += (secondbyte >> 4)
            ivsize += (secondbyte & 0x0f)

            assert len(coder['properties']) == 2+saltsize+ivsize
            salt = coder['properties'][2:2+saltsize]
            iv = coder['properties'][2+saltsize:2+saltsize+ivsize]
            assert len(salt) == saltsize
            assert len(iv) == ivsize
            assert numcyclespower <= 24
            if ivsize < 16:
                # pad the iv with zero bytes to 16 bytes
                iv += bytes('\x00'*(16-ivsize), 'ascii')
        else:
            salt = iv = bytes('', 'ascii')

        password = self._archive.password.encode('utf-16-le')
        key = pylzma.calculate_key(password, numcyclespower, salt=salt)
        cipher = pylzma.AESDecrypt(key, iv=iv)
        if not input:
            self._file.seek(self._src_start)
            input = self._file.read(self.compressed)
        result = cipher.decrypt(input)
        return result

    def checkcrc(self):
        """Return True if the file has no digest or its contents match it."""
        if self.digest is None:
            return True

        self.reset()
        data = self.read()
        return super(ArchiveFile, self).checkcrc(self.digest, data)


class Archive7z(Base):
    """ the archive itself """

    def __init__(self, file, password=None):
        """Parse the archive structure from the file object "file".

        "file" must support "read", "seek" and "tell".  Raises
        FormatError for invalid data and NoPasswordGivenError if the
        header is encrypted and no password was given.
        """
        self._file = file
        self.password = password
        self.header = file.read(len(MAGIC_7Z))
        if self.header != MAGIC_7Z:
            raise FormatError('not a 7z file')
        self.version = unpack('BB', file.read(2))

        self.startheadercrc = unpack('<L', file.read(4))[0]
        self.nextheaderofs, data = self._readReal64Bit(file)
        crc = calculate_crc32(data)
        self.nextheadersize, data = self._readReal64Bit(file)
        crc = calculate_crc32(data, crc)
        data = file.read(4)
        self.nextheadercrc = unpack('<L', data)[0]
        crc = calculate_crc32(data, crc)
        if crc != self.startheadercrc:
            raise FormatError('invalid header data')
        self.afterheader = file.tell()

        file.seek(self.nextheaderofs, 1)
        buffer = BytesIO(file.read(self.nextheadersize))
        if not self.checkcrc(self.nextheadercrc, buffer.getvalue()):
            raise FormatError('invalid header data')

        while True:
            id = buffer.read(1)
            if not id or id == PROPERTY_HEADER:
                break

            if id != PROPERTY_ENCODED_HEADER:
                raise TypeError('Unknown field: %r' % (id))

            # the header itself is stored compressed / encrypted, decode
            # it and continue parsing with the decoded data
            streams = StreamsInfo(buffer)
            file.seek(self.afterheader + 0)
            data = bytes('', 'ascii')
            src_start = self.afterheader
            for folder in streams.unpackinfo.folders:
                if folder.isEncrypted() and not password:
                    raise NoPasswordGivenError()

                src_start += streams.packinfo.packpos
                uncompressed = folder.unpacksizes
                if not isinstance(uncompressed, (list, tuple)):
                    uncompressed = [uncompressed] * len(folder.coders)
                info = {
                    'compressed': streams.packinfo.packsizes[0],
                    '_uncompressed': uncompressed,
                }
                tmp = ArchiveFile(info, 0, src_start, folder, self)
                uncompressed_size = uncompressed[-1]
                folderdata = tmp.read()[:uncompressed_size]
                src_start += uncompressed_size

                if folder.digestdefined:
                    if not self.checkcrc(folder.crc, folderdata):
                        raise FormatError('invalid block data')

                data += folderdata

            buffer = BytesIO(data)

        self.files = []
        self.files_map = {}
        if not id:
            # empty archive
            self.solid = False
            self.numfiles = 0
            self.filenames = []
            return

        self.header = Header(buffer)
        files = self.header.files
        if hasattr(self.header, 'main_streams'):
            folders = self.header.main_streams.unpackinfo.folders
            packinfo = self.header.main_streams.packinfo
            subinfo = self.header.main_streams.substreamsinfo
            packsizes = packinfo.packsizes
            self.solid = packinfo.numstreams == 1
            if hasattr(subinfo, 'unpacksizes'):
                unpacksizes = subinfo.unpacksizes
            else:
                unpacksizes = [x.unpacksizes for x in folders]
        else:
            # TODO(fancycode): is it necessary to provide empty values for folder, packinfo, etc?
            self.solid = False

        fidx = 0        # index of current folder
        obidx = 0       # index of current substream over all folders
        streamidx = 0   # index of current substream in current folder
        src_pos = self.afterheader
        pos = 0
        folder_pos = src_pos
        maxsize = (self.solid and packinfo.packsizes[0]) or None
        for info in files.files:
            # Skip all directory entries.
            attributes = info.get('attributes', None)
            if attributes and attributes & FILE_ATTRIBUTE_DIRECTORY != 0:
                continue

            if not info['emptystream']:
                folder = folders[fidx]
                if streamidx == 0:
                    folder.solid = subinfo.numunpackstreams[fidx] > 1

                maxsize = (folder.solid and packinfo.packsizes[fidx]) or None
                uncompressed = unpacksizes[obidx]
                if not isinstance(uncompressed, (list, tuple)):
                    uncompressed = [uncompressed] * len(folder.coders)
                if pos > 0:
                    # file is part of solid archive
                    assert fidx < len(packsizes), 'Folder outside index for solid archive'
                    info['compressed'] = packsizes[fidx]
                elif fidx < len(packsizes):
                    # file is compressed
                    info['compressed'] = packsizes[fidx]
                else:
                    # file is not compressed
                    info['compressed'] = uncompressed
                info['_uncompressed'] = uncompressed
            else:
                info['compressed'] = 0
                info['_uncompressed'] = [0]
                folder = None
                maxsize = 0

            file = ArchiveFile(info, pos, src_pos, folder, self, maxsize=maxsize)
            self.files.append(file)
            if folder is None:
                # Empty files don't have a substream, so none of the
                # stream / folder indexes may be advanced for them;
                # otherwise sizes and digests of the following files
                # would be looked up at the wrong position.
                continue

            if subinfo.digestsdefined[obidx]:
                file.digest = subinfo.digests[obidx]
            if folder.solid:
                pos += unpacksizes[obidx]
            else:
                src_pos += info['compressed']
            obidx += 1
            streamidx += 1
            if streamidx >= subinfo.numunpackstreams[fidx]:
                # all streams of the folder are processed, go to next one
                pos = 0
                folder_pos += packinfo.packsizes[fidx]
                src_pos = folder_pos
                fidx += 1
                streamidx = 0

        self.numfiles = len(self.files)
        self.filenames = [x.filename for x in self.files]
        self.files_map.update([(x.filename, x) for x in self.files])

    # interface like TarFile

    def getmember(self, name):
        """Return file with the given name or index, None if not found."""
        if isinstance(name, (int, long)):
            try:
                return self.files[name]
            except IndexError:
                return None

        return self.files_map.get(name, None)

    def getmembers(self):
        """Return list of all files in the archive."""
        return self.files

    def getnames(self):
        """Return list of the names of all files in the archive."""
        return self.filenames

    def list(self, verbose=True, file=sys.stdout):
        """Write a list of the archive contents to "file".

        With "verbose" set, size, compressed size and checksum are
        written in addition to the filename.
        """
        file.write('total %d files in %sarchive\n' % (self.numfiles, (self.solid and 'solid ') or ''))
        if not verbose:
            file.write('\n'.join(self.filenames) + '\n')
            return

        for f in self.files:
            extra = (f.compressed and '%10d ' % (f.compressed)) or ' '
            if f.digest is None:
                # files without checksum (e.g. empty files) used to
                # raise a TypeError when formatting None as hex
                digest = ' ' * 8
            else:
                digest = '%.8x' % f.digest
            file.write('%10d%s%s %s\n' % (f.size, extra, digest, f.filename))


if __name__ == '__main__':
    fp = open('test.7z', 'rb')
    #fp = open('pylzma.7z', 'rb')
    try:
        f = Archive7z(fp)
        f.list()
    finally:
        fp.close()