1#------------------------------------------------------------------- 2# tarfile.py 3#------------------------------------------------------------------- 4# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de> 5# All rights reserved. 6# 7# Permission is hereby granted, free of charge, to any person 8# obtaining a copy of this software and associated documentation 9# files (the "Software"), to deal in the Software without 10# restriction, including without limitation the rights to use, 11# copy, modify, merge, publish, distribute, sublicense, and/or sell 12# copies of the Software, and to permit persons to whom the 13# Software is furnished to do so, subject to the following 14# conditions: 15# 16# The above copyright notice and this permission notice shall be 17# included in all copies or substantial portions of the Software. 18# 19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 21# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 23# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 24# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 25# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 26# OTHER DEALINGS IN THE SOFTWARE. 27# 28from __future__ import print_function 29 30"""Read from and write to tar format archives. 31""" 32 33__version__ = "$Revision$" 34 35version = "0.9.0" 36__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)" 37__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $" 38__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $" 39__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend." 
40 41#--------- 42# Imports 43#--------- 44import sys 45import os 46import stat 47import errno 48import time 49import struct 50import copy 51import re 52 53try: 54 import grp, pwd 55except ImportError: 56 grp = pwd = None 57 58# os.symlink on Windows prior to 6.0 raises NotImplementedError 59symlink_exception = (AttributeError, NotImplementedError) 60try: 61 # WindowsError (1314) will be raised if the caller does not hold the 62 # SeCreateSymbolicLinkPrivilege privilege 63 symlink_exception += (WindowsError,) 64except NameError: 65 pass 66 67# from tarfile import * 68__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"] 69 70if sys.version_info[0] < 3: 71 import __builtin__ as builtins 72else: 73 import builtins 74 75_open = builtins.open # Since 'open' is TarFile.open 76 77#--------------------------------------------------------- 78# tar constants 79#--------------------------------------------------------- 80NUL = b"\0" # the null character 81BLOCKSIZE = 512 # length of processing blocks 82RECORDSIZE = BLOCKSIZE * 20 # length of records 83GNU_MAGIC = b"ustar \0" # magic gnu tar string 84POSIX_MAGIC = b"ustar\x0000" # magic posix tar string 85 86LENGTH_NAME = 100 # maximum length of a filename 87LENGTH_LINK = 100 # maximum length of a linkname 88LENGTH_PREFIX = 155 # maximum length of the prefix field 89 90REGTYPE = b"0" # regular file 91AREGTYPE = b"\0" # regular file 92LNKTYPE = b"1" # link (inside tarfile) 93SYMTYPE = b"2" # symbolic link 94CHRTYPE = b"3" # character special device 95BLKTYPE = b"4" # block special device 96DIRTYPE = b"5" # directory 97FIFOTYPE = b"6" # fifo special device 98CONTTYPE = b"7" # contiguous file 99 100GNUTYPE_LONGNAME = b"L" # GNU tar longname 101GNUTYPE_LONGLINK = b"K" # GNU tar longlink 102GNUTYPE_SPARSE = b"S" # GNU tar sparse file 103 104XHDTYPE = b"x" # POSIX.1-2001 extended header 105XGLTYPE = b"g" # POSIX.1-2001 global header 106SOLARIS_XHDTYPE = b"X" # Solaris extended header 107 108USTAR_FORMAT = 0 # POSIX.1-1988 
(ustar) format 109GNU_FORMAT = 1 # GNU tar format 110PAX_FORMAT = 2 # POSIX.1-2001 (pax) format 111DEFAULT_FORMAT = GNU_FORMAT 112 113#--------------------------------------------------------- 114# tarfile constants 115#--------------------------------------------------------- 116# File types that tarfile supports: 117SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, 118 SYMTYPE, DIRTYPE, FIFOTYPE, 119 CONTTYPE, CHRTYPE, BLKTYPE, 120 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 121 GNUTYPE_SPARSE) 122 123# File types that will be treated as a regular file. 124REGULAR_TYPES = (REGTYPE, AREGTYPE, 125 CONTTYPE, GNUTYPE_SPARSE) 126 127# File types that are part of the GNU tar format. 128GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 129 GNUTYPE_SPARSE) 130 131# Fields from a pax header that override a TarInfo attribute. 132PAX_FIELDS = ("path", "linkpath", "size", "mtime", 133 "uid", "gid", "uname", "gname") 134 135# Fields from a pax header that are affected by hdrcharset. 136PAX_NAME_FIELDS = set(("path", "linkpath", "uname", "gname")) 137 138# Fields in a pax header that are numbers, all other fields 139# are treated as strings. 140PAX_NUMBER_FIELDS = { 141 "atime": float, 142 "ctime": float, 143 "mtime": float, 144 "uid": int, 145 "gid": int, 146 "size": int 147} 148 149#--------------------------------------------------------- 150# Bits used in the mode field, values in octal. 
151#--------------------------------------------------------- 152S_IFLNK = 0o120000 # symbolic link 153S_IFREG = 0o100000 # regular file 154S_IFBLK = 0o060000 # block device 155S_IFDIR = 0o040000 # directory 156S_IFCHR = 0o020000 # character device 157S_IFIFO = 0o010000 # fifo 158 159TSUID = 0o4000 # set UID on execution 160TSGID = 0o2000 # set GID on execution 161TSVTX = 0o1000 # reserved 162 163TUREAD = 0o400 # read by owner 164TUWRITE = 0o200 # write by owner 165TUEXEC = 0o100 # execute/search by owner 166TGREAD = 0o040 # read by group 167TGWRITE = 0o020 # write by group 168TGEXEC = 0o010 # execute/search by group 169TOREAD = 0o004 # read by other 170TOWRITE = 0o002 # write by other 171TOEXEC = 0o001 # execute/search by other 172 173#--------------------------------------------------------- 174# initialization 175#--------------------------------------------------------- 176if os.name in ("nt", "ce"): 177 ENCODING = "utf-8" 178else: 179 ENCODING = sys.getfilesystemencoding() 180 181#--------------------------------------------------------- 182# Some useful functions 183#--------------------------------------------------------- 184 185def stn(s, length, encoding, errors): 186 """Convert a string to a null-terminated bytes object. 187 """ 188 s = s.encode(encoding, errors) 189 return s[:length] + (length - len(s)) * NUL 190 191def nts(s, encoding, errors): 192 """Convert a null-terminated bytes object to a string. 193 """ 194 p = s.find(b"\0") 195 if p != -1: 196 s = s[:p] 197 return s.decode(encoding, errors) 198 199def nti(s): 200 """Convert a number field to a python number. 201 """ 202 # There are two possible encodings for a number field, see 203 # itn() below. 
204 if s[0] != chr(0o200): 205 try: 206 n = int(nts(s, "ascii", "strict") or "0", 8) 207 except ValueError: 208 raise InvalidHeaderError("invalid header") 209 else: 210 n = 0 211 for i in range(len(s) - 1): 212 n <<= 8 213 n += ord(s[i + 1]) 214 return n 215 216def itn(n, digits=8, format=DEFAULT_FORMAT): 217 """Convert a python number to a number field. 218 """ 219 # POSIX 1003.1-1988 requires numbers to be encoded as a string of 220 # octal digits followed by a null-byte, this allows values up to 221 # (8**(digits-1))-1. GNU tar allows storing numbers greater than 222 # that if necessary. A leading 0o200 byte indicates this particular 223 # encoding, the following digits-1 bytes are a big-endian 224 # representation. This allows values up to (256**(digits-1))-1. 225 if 0 <= n < 8 ** (digits - 1): 226 s = ("%0*o" % (digits - 1, n)).encode("ascii") + NUL 227 else: 228 if format != GNU_FORMAT or n >= 256 ** (digits - 1): 229 raise ValueError("overflow in number field") 230 231 if n < 0: 232 # XXX We mimic GNU tar's behaviour with negative numbers, 233 # this could raise OverflowError. 234 n = struct.unpack("L", struct.pack("l", n))[0] 235 236 s = bytearray() 237 for i in range(digits - 1): 238 s.insert(0, n & 0o377) 239 n >>= 8 240 s.insert(0, 0o200) 241 return s 242 243def calc_chksums(buf): 244 """Calculate the checksum for a member's header by summing up all 245 characters except for the chksum field which is treated as if 246 it was filled with spaces. According to the GNU tar sources, 247 some tars (Sun and NeXT) calculate chksum with signed char, 248 which will be different if there are chars in the buffer with 249 the high bit set. So we calculate two checksums, unsigned and 250 signed. 
251 """ 252 unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512])) 253 signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512])) 254 return unsigned_chksum, signed_chksum 255 256def copyfileobj(src, dst, length=None): 257 """Copy length bytes from fileobj src to fileobj dst. 258 If length is None, copy the entire content. 259 """ 260 if length == 0: 261 return 262 if length is None: 263 while True: 264 buf = src.read(16*1024) 265 if not buf: 266 break 267 dst.write(buf) 268 return 269 270 BUFSIZE = 16 * 1024 271 blocks, remainder = divmod(length, BUFSIZE) 272 for b in range(blocks): 273 buf = src.read(BUFSIZE) 274 if len(buf) < BUFSIZE: 275 raise IOError("end of file reached") 276 dst.write(buf) 277 278 if remainder != 0: 279 buf = src.read(remainder) 280 if len(buf) < remainder: 281 raise IOError("end of file reached") 282 dst.write(buf) 283 return 284 285filemode_table = ( 286 ((S_IFLNK, "l"), 287 (S_IFREG, "-"), 288 (S_IFBLK, "b"), 289 (S_IFDIR, "d"), 290 (S_IFCHR, "c"), 291 (S_IFIFO, "p")), 292 293 ((TUREAD, "r"),), 294 ((TUWRITE, "w"),), 295 ((TUEXEC|TSUID, "s"), 296 (TSUID, "S"), 297 (TUEXEC, "x")), 298 299 ((TGREAD, "r"),), 300 ((TGWRITE, "w"),), 301 ((TGEXEC|TSGID, "s"), 302 (TSGID, "S"), 303 (TGEXEC, "x")), 304 305 ((TOREAD, "r"),), 306 ((TOWRITE, "w"),), 307 ((TOEXEC|TSVTX, "t"), 308 (TSVTX, "T"), 309 (TOEXEC, "x")) 310) 311 312def filemode(mode): 313 """Convert a file's mode to a string of the form 314 -rwxrwxrwx. 
315 Used by TarFile.list() 316 """ 317 perm = [] 318 for table in filemode_table: 319 for bit, char in table: 320 if mode & bit == bit: 321 perm.append(char) 322 break 323 else: 324 perm.append("-") 325 return "".join(perm) 326 327class TarError(Exception): 328 """Base exception.""" 329 pass 330class ExtractError(TarError): 331 """General exception for extract errors.""" 332 pass 333class ReadError(TarError): 334 """Exception for unreadable tar archives.""" 335 pass 336class CompressionError(TarError): 337 """Exception for unavailable compression methods.""" 338 pass 339class StreamError(TarError): 340 """Exception for unsupported operations on stream-like TarFiles.""" 341 pass 342class HeaderError(TarError): 343 """Base exception for header errors.""" 344 pass 345class EmptyHeaderError(HeaderError): 346 """Exception for empty headers.""" 347 pass 348class TruncatedHeaderError(HeaderError): 349 """Exception for truncated headers.""" 350 pass 351class EOFHeaderError(HeaderError): 352 """Exception for end of file headers.""" 353 pass 354class InvalidHeaderError(HeaderError): 355 """Exception for invalid headers.""" 356 pass 357class SubsequentHeaderError(HeaderError): 358 """Exception for missing and invalid extended headers.""" 359 pass 360 361#--------------------------- 362# internal stream interface 363#--------------------------- 364class _LowLevelFile(object): 365 """Low-level file object. Supports reading and writing. 366 It is used instead of a regular file object for streaming 367 access. 
368 """ 369 370 def __init__(self, name, mode): 371 mode = { 372 "r": os.O_RDONLY, 373 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 374 }[mode] 375 if hasattr(os, "O_BINARY"): 376 mode |= os.O_BINARY 377 self.fd = os.open(name, mode, 0o666) 378 379 def close(self): 380 os.close(self.fd) 381 382 def read(self, size): 383 return os.read(self.fd, size) 384 385 def write(self, s): 386 os.write(self.fd, s) 387 388class _Stream(object): 389 """Class that serves as an adapter between TarFile and 390 a stream-like object. The stream-like object only 391 needs to have a read() or write() method and is accessed 392 blockwise. Use of gzip or bzip2 compression is possible. 393 A stream-like object could be for example: sys.stdin, 394 sys.stdout, a socket, a tape device etc. 395 396 _Stream is intended to be used only internally. 397 """ 398 399 def __init__(self, name, mode, comptype, fileobj, bufsize): 400 """Construct a _Stream object. 401 """ 402 self._extfileobj = True 403 if fileobj is None: 404 fileobj = _LowLevelFile(name, mode) 405 self._extfileobj = False 406 407 if comptype == '*': 408 # Enable transparent compression detection for the 409 # stream interface 410 fileobj = _StreamProxy(fileobj) 411 comptype = fileobj.getcomptype() 412 413 self.name = name or "" 414 self.mode = mode 415 self.comptype = comptype 416 self.fileobj = fileobj 417 self.bufsize = bufsize 418 self.buf = b"" 419 self.pos = 0 420 self.closed = False 421 422 try: 423 if comptype == "gz": 424 try: 425 import zlib 426 except ImportError: 427 raise CompressionError("zlib module is not available") 428 self.zlib = zlib 429 self.crc = zlib.crc32(b"") 430 if mode == "r": 431 self._init_read_gz() 432 else: 433 self._init_write_gz() 434 435 if comptype == "bz2": 436 try: 437 import bz2 438 except ImportError: 439 raise CompressionError("bz2 module is not available") 440 if mode == "r": 441 self.dbuf = b"" 442 self.cmp = bz2.BZ2Decompressor() 443 else: 444 self.cmp = bz2.BZ2Compressor() 445 except: 446 if 
not self._extfileobj: 447 self.fileobj.close() 448 self.closed = True 449 raise 450 451 def __del__(self): 452 if hasattr(self, "closed") and not self.closed: 453 self.close() 454 455 def _init_write_gz(self): 456 """Initialize for writing with gzip compression. 457 """ 458 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED, 459 -self.zlib.MAX_WBITS, 460 self.zlib.DEF_MEM_LEVEL, 461 0) 462 timestamp = struct.pack("<L", int(time.time())) 463 self.__write(b"\037\213\010\010" + timestamp + b"\002\377") 464 if self.name.endswith(".gz"): 465 self.name = self.name[:-3] 466 # RFC1952 says we must use ISO-8859-1 for the FNAME field. 467 self.__write(self.name.encode("iso-8859-1", "replace") + NUL) 468 469 def write(self, s): 470 """Write string s to the stream. 471 """ 472 if self.comptype == "gz": 473 self.crc = self.zlib.crc32(s, self.crc) 474 self.pos += len(s) 475 if self.comptype != "tar": 476 s = self.cmp.compress(s) 477 self.__write(s) 478 479 def __write(self, s): 480 """Write string s to the stream if a whole new block 481 is ready to be written. 482 """ 483 self.buf += s 484 while len(self.buf) > self.bufsize: 485 self.fileobj.write(self.buf[:self.bufsize]) 486 self.buf = self.buf[self.bufsize:] 487 488 def close(self): 489 """Close the _Stream object. No operation should be 490 done on it afterwards. 491 """ 492 if self.closed: 493 return 494 495 if self.mode == "w" and self.comptype != "tar": 496 self.buf += self.cmp.flush() 497 498 if self.mode == "w" and self.buf: 499 self.fileobj.write(self.buf) 500 self.buf = b"" 501 if self.comptype == "gz": 502 # The native zlib crc is an unsigned 32-bit integer, but 503 # the Python wrapper implicitly casts that to a signed C 504 # long. So, on a 32-bit box self.crc may "look negative", 505 # while the same crc on a 64-bit box may "look positive". 506 # To avoid irksome warnings from the `struct` module, force 507 # it to look positive on all boxes. 
508 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff)) 509 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF)) 510 511 if not self._extfileobj: 512 self.fileobj.close() 513 514 self.closed = True 515 516 def _init_read_gz(self): 517 """Initialize for reading a gzip compressed fileobj. 518 """ 519 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) 520 self.dbuf = b"" 521 522 # taken from gzip.GzipFile with some alterations 523 if self.__read(2) != b"\037\213": 524 raise ReadError("not a gzip file") 525 if self.__read(1) != b"\010": 526 raise CompressionError("unsupported compression method") 527 528 flag = ord(self.__read(1)) 529 self.__read(6) 530 531 if flag & 4: 532 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1)) 533 self.read(xlen) 534 if flag & 8: 535 while True: 536 s = self.__read(1) 537 if not s or s == NUL: 538 break 539 if flag & 16: 540 while True: 541 s = self.__read(1) 542 if not s or s == NUL: 543 break 544 if flag & 2: 545 self.__read(2) 546 547 def tell(self): 548 """Return the stream's file pointer position. 549 """ 550 return self.pos 551 552 def seek(self, pos=0): 553 """Set the stream's file pointer to pos. Negative seeking 554 is forbidden. 555 """ 556 if pos - self.pos >= 0: 557 blocks, remainder = divmod(pos - self.pos, self.bufsize) 558 for i in range(blocks): 559 self.read(self.bufsize) 560 self.read(remainder) 561 else: 562 raise StreamError("seeking backwards is not allowed") 563 return self.pos 564 565 def read(self, size=None): 566 """Return the next size number of bytes from the stream. 567 If size is not defined, return all bytes of the stream 568 up to EOF. 569 """ 570 if size is None: 571 t = [] 572 while True: 573 buf = self._read(self.bufsize) 574 if not buf: 575 break 576 t.append(buf) 577 buf = "".join(t) 578 else: 579 buf = self._read(size) 580 self.pos += len(buf) 581 return buf 582 583 def _read(self, size): 584 """Return size bytes from the stream. 
585 """ 586 if self.comptype == "tar": 587 return self.__read(size) 588 589 c = len(self.dbuf) 590 while c < size: 591 buf = self.__read(self.bufsize) 592 if not buf: 593 break 594 try: 595 buf = self.cmp.decompress(buf) 596 except IOError: 597 raise ReadError("invalid compressed data") 598 self.dbuf += buf 599 c += len(buf) 600 buf = self.dbuf[:size] 601 self.dbuf = self.dbuf[size:] 602 return buf 603 604 def __read(self, size): 605 """Return size bytes from stream. If internal buffer is empty, 606 read another block from the stream. 607 """ 608 c = len(self.buf) 609 while c < size: 610 buf = self.fileobj.read(self.bufsize) 611 if not buf: 612 break 613 self.buf += buf 614 c += len(buf) 615 buf = self.buf[:size] 616 self.buf = self.buf[size:] 617 return buf 618# class _Stream 619 620class _StreamProxy(object): 621 """Small proxy class that enables transparent compression 622 detection for the Stream interface (mode 'r|*'). 623 """ 624 625 def __init__(self, fileobj): 626 self.fileobj = fileobj 627 self.buf = self.fileobj.read(BLOCKSIZE) 628 629 def read(self, size): 630 self.read = self.fileobj.read 631 return self.buf 632 633 def getcomptype(self): 634 if self.buf.startswith(b"\037\213\010"): 635 return "gz" 636 if self.buf.startswith(b"BZh91"): 637 return "bz2" 638 return "tar" 639 640 def close(self): 641 self.fileobj.close() 642# class StreamProxy 643 644class _BZ2Proxy(object): 645 """Small proxy class that enables external file object 646 support for "r:bz2" and "w:bz2" modes. This is actually 647 a workaround for a limitation in bz2 module's BZ2File 648 class which (unlike gzip.GzipFile) has no support for 649 a file object argument. 
650 """ 651 652 blocksize = 16 * 1024 653 654 def __init__(self, fileobj, mode): 655 self.fileobj = fileobj 656 self.mode = mode 657 self.name = getattr(self.fileobj, "name", None) 658 self.init() 659 660 def init(self): 661 import bz2 662 self.pos = 0 663 if self.mode == "r": 664 self.bz2obj = bz2.BZ2Decompressor() 665 self.fileobj.seek(0) 666 self.buf = b"" 667 else: 668 self.bz2obj = bz2.BZ2Compressor() 669 670 def read(self, size): 671 x = len(self.buf) 672 while x < size: 673 raw = self.fileobj.read(self.blocksize) 674 if not raw: 675 break 676 data = self.bz2obj.decompress(raw) 677 self.buf += data 678 x += len(data) 679 680 buf = self.buf[:size] 681 self.buf = self.buf[size:] 682 self.pos += len(buf) 683 return buf 684 685 def seek(self, pos): 686 if pos < self.pos: 687 self.init() 688 self.read(pos - self.pos) 689 690 def tell(self): 691 return self.pos 692 693 def write(self, data): 694 self.pos += len(data) 695 raw = self.bz2obj.compress(data) 696 self.fileobj.write(raw) 697 698 def close(self): 699 if self.mode == "w": 700 raw = self.bz2obj.flush() 701 self.fileobj.write(raw) 702# class _BZ2Proxy 703 704#------------------------ 705# Extraction file object 706#------------------------ 707class _FileInFile(object): 708 """A thin wrapper around an existing file object that 709 provides a part of its data as an individual file 710 object. 711 """ 712 713 def __init__(self, fileobj, offset, size, blockinfo=None): 714 self.fileobj = fileobj 715 self.offset = offset 716 self.size = size 717 self.position = 0 718 719 if blockinfo is None: 720 blockinfo = [(0, size)] 721 722 # Construct a map with data and zero blocks. 
723 self.map_index = 0 724 self.map = [] 725 lastpos = 0 726 realpos = self.offset 727 for offset, size in blockinfo: 728 if offset > lastpos: 729 self.map.append((False, lastpos, offset, None)) 730 self.map.append((True, offset, offset + size, realpos)) 731 realpos += size 732 lastpos = offset + size 733 if lastpos < self.size: 734 self.map.append((False, lastpos, self.size, None)) 735 736 def seekable(self): 737 if not hasattr(self.fileobj, "seekable"): 738 # XXX gzip.GzipFile and bz2.BZ2File 739 return True 740 return self.fileobj.seekable() 741 742 def tell(self): 743 """Return the current file position. 744 """ 745 return self.position 746 747 def seek(self, position): 748 """Seek to a position in the file. 749 """ 750 self.position = position 751 752 def read(self, size=None): 753 """Read data from the file. 754 """ 755 if size is None: 756 size = self.size - self.position 757 else: 758 size = min(size, self.size - self.position) 759 760 buf = b"" 761 while size > 0: 762 while True: 763 data, start, stop, offset = self.map[self.map_index] 764 if start <= self.position < stop: 765 break 766 else: 767 self.map_index += 1 768 if self.map_index == len(self.map): 769 self.map_index = 0 770 length = min(size, stop - self.position) 771 if data: 772 self.fileobj.seek(offset + (self.position - start)) 773 buf += self.fileobj.read(length) 774 else: 775 buf += NUL * length 776 size -= length 777 self.position += length 778 return buf 779#class _FileInFile 780 781 782class ExFileObject(object): 783 """File-like object for reading an archive member. 784 Is returned by TarFile.extractfile(). 
785 """ 786 blocksize = 1024 787 788 def __init__(self, tarfile, tarinfo): 789 self.fileobj = _FileInFile(tarfile.fileobj, 790 tarinfo.offset_data, 791 tarinfo.size, 792 tarinfo.sparse) 793 self.name = tarinfo.name 794 self.mode = "r" 795 self.closed = False 796 self.size = tarinfo.size 797 798 self.position = 0 799 self.buffer = b"" 800 801 def readable(self): 802 return True 803 804 def writable(self): 805 return False 806 807 def seekable(self): 808 return self.fileobj.seekable() 809 810 def read(self, size=None): 811 """Read at most size bytes from the file. If size is not 812 present or None, read all data until EOF is reached. 813 """ 814 if self.closed: 815 raise ValueError("I/O operation on closed file") 816 817 buf = b"" 818 if self.buffer: 819 if size is None: 820 buf = self.buffer 821 self.buffer = b"" 822 else: 823 buf = self.buffer[:size] 824 self.buffer = self.buffer[size:] 825 826 if size is None: 827 buf += self.fileobj.read() 828 else: 829 buf += self.fileobj.read(size - len(buf)) 830 831 self.position += len(buf) 832 return buf 833 834 # XXX TextIOWrapper uses the read1() method. 835 read1 = read 836 837 def readline(self, size=-1): 838 """Read one entire line from the file. If size is present 839 and non-negative, return a string with at most that 840 size, which may be an incomplete line. 841 """ 842 if self.closed: 843 raise ValueError("I/O operation on closed file") 844 845 pos = self.buffer.find(b"\n") + 1 846 if pos == 0: 847 # no newline found. 848 while True: 849 buf = self.fileobj.read(self.blocksize) 850 self.buffer += buf 851 if not buf or b"\n" in buf: 852 pos = self.buffer.find(b"\n") + 1 853 if pos == 0: 854 # no newline found. 855 pos = len(self.buffer) 856 break 857 858 if size != -1: 859 pos = min(size, pos) 860 861 buf = self.buffer[:pos] 862 self.buffer = self.buffer[pos:] 863 self.position += len(buf) 864 return buf 865 866 def readlines(self): 867 """Return a list with all remaining lines. 
868 """ 869 result = [] 870 while True: 871 line = self.readline() 872 if not line: break 873 result.append(line) 874 return result 875 876 def tell(self): 877 """Return the current file position. 878 """ 879 if self.closed: 880 raise ValueError("I/O operation on closed file") 881 882 return self.position 883 884 def seek(self, pos, whence=os.SEEK_SET): 885 """Seek to a position in the file. 886 """ 887 if self.closed: 888 raise ValueError("I/O operation on closed file") 889 890 if whence == os.SEEK_SET: 891 self.position = min(max(pos, 0), self.size) 892 elif whence == os.SEEK_CUR: 893 if pos < 0: 894 self.position = max(self.position + pos, 0) 895 else: 896 self.position = min(self.position + pos, self.size) 897 elif whence == os.SEEK_END: 898 self.position = max(min(self.size + pos, self.size), 0) 899 else: 900 raise ValueError("Invalid argument") 901 902 self.buffer = b"" 903 self.fileobj.seek(self.position) 904 905 def close(self): 906 """Close the file object. 907 """ 908 self.closed = True 909 910 def __iter__(self): 911 """Get an iterator over the file's lines. 912 """ 913 while True: 914 line = self.readline() 915 if not line: 916 break 917 yield line 918#class ExFileObject 919 920#------------------ 921# Exported Classes 922#------------------ 923class TarInfo(object): 924 """Informational class which holds the details about an 925 archive member given by a tar header block. 926 TarInfo objects are returned by TarFile.getmember(), 927 TarFile.getmembers() and TarFile.gettarinfo() and are 928 usually created internally. 929 """ 930 931 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime", 932 "chksum", "type", "linkname", "uname", "gname", 933 "devmajor", "devminor", 934 "offset", "offset_data", "pax_headers", "sparse", 935 "tarfile", "_sparse_structs", "_link_target") 936 937 def __init__(self, name=""): 938 """Construct a TarInfo object. name is the optional name 939 of the member. 
940 """ 941 self.name = name # member name 942 self.mode = 0o644 # file permissions 943 self.uid = 0 # user id 944 self.gid = 0 # group id 945 self.size = 0 # file size 946 self.mtime = 0 # modification time 947 self.chksum = 0 # header checksum 948 self.type = REGTYPE # member type 949 self.linkname = "" # link name 950 self.uname = "" # user name 951 self.gname = "" # group name 952 self.devmajor = 0 # device major number 953 self.devminor = 0 # device minor number 954 955 self.offset = 0 # the tar header starts here 956 self.offset_data = 0 # the file's data starts here 957 958 self.sparse = None # sparse member information 959 self.pax_headers = {} # pax header information 960 961 # In pax headers the "name" and "linkname" field are called 962 # "path" and "linkpath". 963 def _getpath(self): 964 return self.name 965 def _setpath(self, name): 966 self.name = name 967 path = property(_getpath, _setpath) 968 969 def _getlinkpath(self): 970 return self.linkname 971 def _setlinkpath(self, linkname): 972 self.linkname = linkname 973 linkpath = property(_getlinkpath, _setlinkpath) 974 975 def __repr__(self): 976 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) 977 978 def get_info(self): 979 """Return the TarInfo's attributes as a dictionary. 980 """ 981 info = { 982 "name": self.name, 983 "mode": self.mode & 0o7777, 984 "uid": self.uid, 985 "gid": self.gid, 986 "size": self.size, 987 "mtime": self.mtime, 988 "chksum": self.chksum, 989 "type": self.type, 990 "linkname": self.linkname, 991 "uname": self.uname, 992 "gname": self.gname, 993 "devmajor": self.devmajor, 994 "devminor": self.devminor 995 } 996 997 if info["type"] == DIRTYPE and not info["name"].endswith("/"): 998 info["name"] += "/" 999 1000 return info 1001 1002 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"): 1003 """Return a tar header as a string of 512 byte blocks. 
1004 """ 1005 info = self.get_info() 1006 1007 if format == USTAR_FORMAT: 1008 return self.create_ustar_header(info, encoding, errors) 1009 elif format == GNU_FORMAT: 1010 return self.create_gnu_header(info, encoding, errors) 1011 elif format == PAX_FORMAT: 1012 return self.create_pax_header(info, encoding) 1013 else: 1014 raise ValueError("invalid format") 1015 1016 def create_ustar_header(self, info, encoding, errors): 1017 """Return the object as a ustar header block. 1018 """ 1019 info["magic"] = POSIX_MAGIC 1020 1021 if len(info["linkname"]) > LENGTH_LINK: 1022 raise ValueError("linkname is too long") 1023 1024 if len(info["name"]) > LENGTH_NAME: 1025 info["prefix"], info["name"] = self._posix_split_name(info["name"]) 1026 1027 return self._create_header(info, USTAR_FORMAT, encoding, errors) 1028 1029 def create_gnu_header(self, info, encoding, errors): 1030 """Return the object as a GNU header block sequence. 1031 """ 1032 info["magic"] = GNU_MAGIC 1033 1034 buf = b"" 1035 if len(info["linkname"]) > LENGTH_LINK: 1036 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors) 1037 1038 if len(info["name"]) > LENGTH_NAME: 1039 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors) 1040 1041 return buf + self._create_header(info, GNU_FORMAT, encoding, errors) 1042 1043 def create_pax_header(self, info, encoding): 1044 """Return the object as a ustar header block. If it cannot be 1045 represented this way, prepend a pax extended header sequence 1046 with supplement information. 1047 """ 1048 info["magic"] = POSIX_MAGIC 1049 pax_headers = self.pax_headers.copy() 1050 1051 # Test string fields for values that exceed the field length or cannot 1052 # be represented in ASCII encoding. 
1053 for name, hname, length in ( 1054 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), 1055 ("uname", "uname", 32), ("gname", "gname", 32)): 1056 1057 if hname in pax_headers: 1058 # The pax header has priority. 1059 continue 1060 1061 # Try to encode the string as ASCII. 1062 try: 1063 info[name].encode("ascii", "strict") 1064 except UnicodeEncodeError: 1065 pax_headers[hname] = info[name] 1066 continue 1067 1068 if len(info[name]) > length: 1069 pax_headers[hname] = info[name] 1070 1071 # Test number fields for values that exceed the field limit or values 1072 # that like to be stored as float. 1073 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): 1074 if name in pax_headers: 1075 # The pax header has priority. Avoid overflow. 1076 info[name] = 0 1077 continue 1078 1079 val = info[name] 1080 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): 1081 pax_headers[name] = str(val) 1082 info[name] = 0 1083 1084 # Create a pax extended header if necessary. 1085 if pax_headers: 1086 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding) 1087 else: 1088 buf = b"" 1089 1090 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace") 1091 1092 @classmethod 1093 def create_pax_global_header(cls, pax_headers): 1094 """Return the object as a pax global header block sequence. 1095 """ 1096 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8") 1097 1098 def _posix_split_name(self, name): 1099 """Split a name longer than 100 chars into a prefix 1100 and a name part. 1101 """ 1102 prefix = name[:LENGTH_PREFIX + 1] 1103 while prefix and prefix[-1] != "/": 1104 prefix = prefix[:-1] 1105 1106 name = name[len(prefix):] 1107 prefix = prefix[:-1] 1108 1109 if not prefix or len(name) > LENGTH_NAME: 1110 raise ValueError("name is too long") 1111 return prefix, name 1112 1113 @staticmethod 1114 def _create_header(info, format, encoding, errors): 1115 """Return a header block. 
 info is a dictionary with file
        information, format must be one of the *_FORMAT constants.
        """
        # Field order and widths follow the ustar header layout; stn()/itn()
        # encode string and number fields to their fixed widths.
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field, filled in below
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Zero-pad the joined fields up to a full 512-byte block.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Patch the checksum into its field: offset 148 from the block start,
        # i.e. -364 from the end; the field is 8 bytes ("%06o\0" plus the
        # byte kept from the original spacer).
        buf = buf[:-364] + ("%06o\0" % chksum).encode("ascii") + buf[-357:]
        return buf

    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
        up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
        for name.
        """
        # GNU long names/links are stored NUL-terminated in the payload.
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf8")

            # Each record is "<length> <keyword>=<value>\n" where <length>
            # counts the whole record including the length digits themselves.
            # Iterate until the digit count stabilizes (fixpoint).
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)

    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
1222 """ 1223 if len(buf) == 0: 1224 raise EmptyHeaderError("empty header") 1225 if len(buf) != BLOCKSIZE: 1226 raise TruncatedHeaderError("truncated header") 1227 if buf.count(NUL) == BLOCKSIZE: 1228 raise EOFHeaderError("end of file header") 1229 1230 chksum = nti(buf[148:156]) 1231 if chksum not in calc_chksums(buf): 1232 raise InvalidHeaderError("bad checksum") 1233 1234 obj = cls() 1235 obj.name = nts(buf[0:100], encoding, errors) 1236 obj.mode = nti(buf[100:108]) 1237 obj.uid = nti(buf[108:116]) 1238 obj.gid = nti(buf[116:124]) 1239 obj.size = nti(buf[124:136]) 1240 obj.mtime = nti(buf[136:148]) 1241 obj.chksum = chksum 1242 obj.type = buf[156:157] 1243 obj.linkname = nts(buf[157:257], encoding, errors) 1244 obj.uname = nts(buf[265:297], encoding, errors) 1245 obj.gname = nts(buf[297:329], encoding, errors) 1246 obj.devmajor = nti(buf[329:337]) 1247 obj.devminor = nti(buf[337:345]) 1248 prefix = nts(buf[345:500], encoding, errors) 1249 1250 # Old V7 tar format represents a directory as a regular 1251 # file with a trailing slash. 1252 if obj.type == AREGTYPE and obj.name.endswith("/"): 1253 obj.type = DIRTYPE 1254 1255 # The old GNU sparse format occupies some of the unused 1256 # space in the buffer for up to 4 sparse structures. 1257 # Save the them for later processing in _proc_sparse(). 1258 if obj.type == GNUTYPE_SPARSE: 1259 pos = 386 1260 structs = [] 1261 for i in range(4): 1262 try: 1263 offset = nti(buf[pos:pos + 12]) 1264 numbytes = nti(buf[pos + 12:pos + 24]) 1265 except ValueError: 1266 break 1267 structs.append((offset, numbytes)) 1268 pos += 24 1269 isextended = bool(buf[482]) 1270 origsize = nti(buf[483:495]) 1271 obj._sparse_structs = (structs, isextended, origsize) 1272 1273 # Remove redundant slashes from directories. 1274 if obj.isdir(): 1275 obj.name = obj.name.rstrip("/") 1276 1277 # Reconstruct a ustar longname. 
1278 if prefix and obj.type not in GNU_TYPES: 1279 obj.name = prefix + "/" + obj.name 1280 return obj 1281 1282 @classmethod 1283 def fromtarfile(cls, tarfile): 1284 """Return the next TarInfo object from TarFile object 1285 tarfile. 1286 """ 1287 buf = tarfile.fileobj.read(BLOCKSIZE) 1288 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors) 1289 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE 1290 return obj._proc_member(tarfile) 1291 1292 #-------------------------------------------------------------------------- 1293 # The following are methods that are called depending on the type of a 1294 # member. The entry point is _proc_member() which can be overridden in a 1295 # subclass to add custom _proc_*() methods. A _proc_*() method MUST 1296 # implement the following 1297 # operations: 1298 # 1. Set self.offset_data to the position where the data blocks begin, 1299 # if there is data that follows. 1300 # 2. Set tarfile.offset to the position where the next member's header will 1301 # begin. 1302 # 3. Return self or another valid TarInfo object. 1303 def _proc_member(self, tarfile): 1304 """Choose the right processing method depending on 1305 the type and call it. 1306 """ 1307 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): 1308 return self._proc_gnulong(tarfile) 1309 elif self.type == GNUTYPE_SPARSE: 1310 return self._proc_sparse(tarfile) 1311 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): 1312 return self._proc_pax(tarfile) 1313 else: 1314 return self._proc_builtin(tarfile) 1315 1316 def _proc_builtin(self, tarfile): 1317 """Process a builtin type or an unknown type which 1318 will be treated as a regular file. 1319 """ 1320 self.offset_data = tarfile.fileobj.tell() 1321 offset = self.offset_data 1322 if self.isreg() or self.type not in SUPPORTED_TYPES: 1323 # Skip the following data blocks. 1324 offset += self._block(self.size) 1325 tarfile.offset = offset 1326 1327 # Patch the TarInfo object with saved global 1328 # header information. 
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
        or longlink member.

        The long name/link data precedes the member it describes; the
        real member's header is fetched next and patched with it.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extension block holds up to 21 (offset, numbytes) pairs
            # of 12 octal digits each (21 * 24 = 504 bytes).
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags whether yet another extension block follows.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # self.size read from the header is the on-disk (compacted) size;
        # report the original (expanded) size to the user.
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
        POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            # The value runs from just after "=" to just before the record's
            # trailing newline, as measured from the record start.
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf8", "utf8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf8", "utf8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.

        Offsets and sizes are stored as repeated GNU.sparse.offset /
        GNU.sparse.numbytes records in the raw header buffer.
        """
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.

        The sparse map is a single comma-separated list of alternating
        offset and size values.
        """
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

        The sparse map is stored as newline-separated decimal numbers in
        the member's leading data blocks: a count followed by that many
        offset/size pairs.
        """
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                # The map may span several blocks; keep reading.
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
        pax extended or global header.
1521 """ 1522 for keyword, value in pax_headers.items(): 1523 if keyword == "GNU.sparse.name": 1524 setattr(self, "path", value) 1525 elif keyword == "GNU.sparse.size": 1526 setattr(self, "size", int(value)) 1527 elif keyword == "GNU.sparse.realsize": 1528 setattr(self, "size", int(value)) 1529 elif keyword in PAX_FIELDS: 1530 if keyword in PAX_NUMBER_FIELDS: 1531 try: 1532 value = PAX_NUMBER_FIELDS[keyword](value) 1533 except ValueError: 1534 value = 0 1535 if keyword == "path": 1536 value = value.rstrip("/") 1537 setattr(self, keyword, value) 1538 1539 self.pax_headers = pax_headers.copy() 1540 1541 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors): 1542 """Decode a single field from a pax record. 1543 """ 1544 try: 1545 return value.decode(encoding, "strict") 1546 except UnicodeDecodeError: 1547 return value.decode(fallback_encoding, fallback_errors) 1548 1549 def _block(self, count): 1550 """Round up a byte count by BLOCKSIZE and return it, 1551 e.g. _block(834) => 1024. 1552 """ 1553 blocks, remainder = divmod(count, BLOCKSIZE) 1554 if remainder: 1555 blocks += 1 1556 return blocks * BLOCKSIZE 1557 1558 def isreg(self): 1559 return self.type in REGULAR_TYPES 1560 def isfile(self): 1561 return self.isreg() 1562 def isdir(self): 1563 return self.type == DIRTYPE 1564 def issym(self): 1565 return self.type == SYMTYPE 1566 def islnk(self): 1567 return self.type == LNKTYPE 1568 def ischr(self): 1569 return self.type == CHRTYPE 1570 def isblk(self): 1571 return self.type == BLKTYPE 1572 def isfifo(self): 1573 return self.type == FIFOTYPE 1574 def issparse(self): 1575 return self.sparse is not None 1576 def isdev(self): 1577 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) 1578# class TarInfo 1579 1580class TarFile(object): 1581 """The TarFile Class provides an interface to tar archives. 
1582 """ 1583 1584 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs) 1585 1586 dereference = False # If true, add content of linked file to the 1587 # tar file, else the link. 1588 1589 ignore_zeros = False # If true, skips empty or invalid blocks and 1590 # continues processing. 1591 1592 errorlevel = 1 # If 0, fatal errors only appear in debug 1593 # messages (if debug >= 0). If > 0, errors 1594 # are passed to the caller as exceptions. 1595 1596 format = DEFAULT_FORMAT # The format to use when creating an archive. 1597 1598 encoding = ENCODING # Encoding for 8-bit character strings. 1599 1600 errors = None # Error handler for unicode conversion. 1601 1602 tarinfo = TarInfo # The default TarInfo class to use. 1603 1604 fileobject = ExFileObject # The default ExFileObject class to use. 1605 1606 def __init__(self, name=None, mode="r", fileobj=None, format=None, 1607 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, 1608 errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None): 1609 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to 1610 read from an existing archive, 'a' to append data to an existing 1611 file or 'w' to create a new file overwriting an existing one. `mode' 1612 defaults to 'r'. 1613 If `fileobj' is given, it is used for reading or writing data. If it 1614 can be determined, `mode' is overridden by `fileobj's mode. 1615 `fileobj' is not closed, when TarFile is closed. 1616 """ 1617 if len(mode) > 1 or mode not in "raw": 1618 raise ValueError("mode must be 'r', 'a' or 'w'") 1619 self.mode = mode 1620 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] 1621 1622 if not fileobj: 1623 if self.mode == "a" and not os.path.exists(name): 1624 # Create nonexistent files in append mode. 
1625 self.mode = "w" 1626 self._mode = "wb" 1627 fileobj = bltn_open(name, self._mode) 1628 self._extfileobj = False 1629 else: 1630 if name is None and hasattr(fileobj, "name"): 1631 name = fileobj.name 1632 if hasattr(fileobj, "mode"): 1633 self._mode = fileobj.mode 1634 self._extfileobj = True 1635 self.name = os.path.abspath(name) if name else None 1636 self.fileobj = fileobj 1637 1638 # Init attributes. 1639 if format is not None: 1640 self.format = format 1641 if tarinfo is not None: 1642 self.tarinfo = tarinfo 1643 if dereference is not None: 1644 self.dereference = dereference 1645 if ignore_zeros is not None: 1646 self.ignore_zeros = ignore_zeros 1647 if encoding is not None: 1648 self.encoding = encoding 1649 self.errors = errors 1650 1651 if pax_headers is not None and self.format == PAX_FORMAT: 1652 self.pax_headers = pax_headers 1653 else: 1654 self.pax_headers = {} 1655 1656 if debug is not None: 1657 self.debug = debug 1658 if errorlevel is not None: 1659 self.errorlevel = errorlevel 1660 1661 # Init datastructures. 1662 self.closed = False 1663 self.members = [] # list of members as TarInfo objects 1664 self._loaded = False # flag if all members have been read 1665 self.offset = self.fileobj.tell() 1666 # current position in the archive file 1667 self.inodes = {} # dictionary caching the inodes of 1668 # archive members already added 1669 1670 try: 1671 if self.mode == "r": 1672 self.firstmember = None 1673 self.firstmember = self.next() 1674 1675 if self.mode == "a": 1676 # Move to the end of the archive, 1677 # before the first empty block. 
1678 while True: 1679 self.fileobj.seek(self.offset) 1680 try: 1681 tarinfo = self.tarinfo.fromtarfile(self) 1682 self.members.append(tarinfo) 1683 except EOFHeaderError: 1684 self.fileobj.seek(self.offset) 1685 break 1686 except HeaderError as e: 1687 raise ReadError(str(e)) 1688 1689 if self.mode in "aw": 1690 self._loaded = True 1691 1692 if self.pax_headers: 1693 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) 1694 self.fileobj.write(buf) 1695 self.offset += len(buf) 1696 except: 1697 if not self._extfileobj: 1698 self.fileobj.close() 1699 self.closed = True 1700 raise 1701 1702 #-------------------------------------------------------------------------- 1703 # Below are the classmethods which act as alternate constructors to the 1704 # TarFile class. The open() method is the only one that is needed for 1705 # public use; it is the "super"-constructor and is able to select an 1706 # adequate "sub"-constructor for a particular compression using the mapping 1707 # from OPEN_METH. 1708 # 1709 # This concept allows one to subclass TarFile without losing the comfort of 1710 # the super-constructor. A sub-constructor is registered and made available 1711 # by adding it to the mapping in OPEN_METH. 1712 1713 @classmethod 1714 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): 1715 """Open a tar archive for reading, writing or appending. Return 1716 an appropriate TarFile class. 

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    # Remember the position so a failed attempt with one
                    # compression type can rewind before trying the next.
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                # Close the stream wrapper we created before re-raising.
                stream.close()
                raise
            # The stream is owned by the TarFile and closed with it.
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")

    @classmethod
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        return cls(name, mode, fileobj, **kwargs)

    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller supplied the file object, so we only
        # close what we created ourselves on failure.
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except IOError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                # GzipFile construction itself failed; propagate the IOError.
                raise
            raise ReadError("not a gzip file")
        except:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t

    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'.")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        if fileobj is not None:
            # _BZ2Proxy adapts an arbitrary file object for the bz2 codec.
            fileobj = _BZ2Proxy(fileobj, mode)
        else:
            fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (IOError, EOFError):
            fileobj.close()
            raise ReadError("not a bzip2 file")
        t._extfileobj = False
        return t

    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open"    # bzip2 compressed tar
    }

    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive.
1867 """ 1868 if self.closed: 1869 return 1870 1871 if self.mode in "aw": 1872 self.fileobj.write(NUL * (BLOCKSIZE * 2)) 1873 self.offset += (BLOCKSIZE * 2) 1874 # fill up the end with zero-blocks 1875 # (like option -b20 for tar does) 1876 blocks, remainder = divmod(self.offset, RECORDSIZE) 1877 if remainder > 0: 1878 self.fileobj.write(NUL * (RECORDSIZE - remainder)) 1879 1880 if not self._extfileobj: 1881 self.fileobj.close() 1882 self.closed = True 1883 1884 def getmember(self, name): 1885 """Return a TarInfo object for member `name'. If `name' can not be 1886 found in the archive, KeyError is raised. If a member occurs more 1887 than once in the archive, its last occurrence is assumed to be the 1888 most up-to-date version. 1889 """ 1890 tarinfo = self._getmember(name) 1891 if tarinfo is None: 1892 raise KeyError("filename %r not found" % name) 1893 return tarinfo 1894 1895 def getmembers(self): 1896 """Return the members of the archive as a list of TarInfo objects. The 1897 list has the same order as the members in the archive. 1898 """ 1899 self._check() 1900 if not self._loaded: # if we want to obtain a list of 1901 self._load() # all members, we first have to 1902 # scan the whole archive. 1903 return self.members 1904 1905 def getnames(self): 1906 """Return the members of the archive as a list of their names. It has 1907 the same order as the list returned by getmembers(). 1908 """ 1909 return [tarinfo.name for tarinfo in self.getmembers()] 1910 1911 def gettarinfo(self, name=None, arcname=None, fileobj=None): 1912 """Create a TarInfo object for either the file `name' or the file 1913 object `fileobj' (using os.fstat on its file descriptor). You can 1914 modify some of the TarInfo's attributes before you add it using 1915 addfile(). If given, `arcname' specifies an alternative name for the 1916 file in the archive. 1917 """ 1918 self._check("aw") 1919 1920 # When fileobj is given, replace name by 1921 # fileobj's real name. 
        # NOTE(review): this chunk begins in the middle of gettarinfo(); the
        # method's signature (name, arcname, fileobj) is defined above this view.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Sockets and other exotic file types are not representable
            # in a tar archive.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Map numeric ids to symbolic names where the system databases
        # know them; unknown ids are simply left unnamed.
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo

    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print(filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    # Devices show "major,minor" instead of a size.
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()

    def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
           setting `recursive' to False. `exclude' is a function that should
           return True for each filename to be excluded. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            f = bltn_open(name, "rb")
            self.addfile(tarinfo, f)
            f.close()

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)

        else:
            # Links, fifos and device nodes carry no payload.
            self.addfile(tarinfo)

    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, tarinfo.size bytes are read from it and added to the archive.
           You can create TarInfo objects using gettarinfo().
           On Windows platforms, `fileobj' should always be opened with mode
           'rb' to avoid irritation about the file size.
        """
        self._check("aw")

        # Copy so that later mutation of the caller's object cannot corrupt
        # the member list.
        tarinfo = copy.copy(tarinfo)

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # If there's data to follow, append it.
        if fileobj is not None:
            copyfileobj(fileobj, self.fileobj, tarinfo.size)
            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
            if remainder > 0:
                # Pad the last data block with NULs up to BLOCKSIZE.
                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                blocks += 1
            self.offset += blocks * BLOCKSIZE

        self.members.append(tarinfo)

    def extractall(self, path=".", members=None):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        # NOTE(review): member names are not sanitized; a malicious archive
        # can write outside `path' via ".." components or absolute names
        # (CVE-2007-4559). Never extract archives from untrusted sources.
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
                # Extract directories with a safe mode.
                directories.append(tarinfo)
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0o700
            # Do not set_attrs directories, as we will do that further down
            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())

        # Reverse sort directories so that nested directories are handled
        # before their parents (depth-first).
        directories.sort(key=lambda a: a.name)
        directories.reverse()

        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
            try:
                self.chown(tarinfo, dirpath)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)

    def extract(self, member, path="", set_attrs=True):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False.
        """
        # NOTE(review): tarinfo.name is joined onto `path' unchecked, so a
        # crafted archive can escape the destination directory
        # (CVE-2007-4559). Only extract trusted archives.
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs)
        except EnvironmentError as e:
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)

    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file, a
           file-like object is returned. If `member' is a link, a file-like
           object is constructed from the link's target. If `member' is none of
           the above, None is returned.
           The file-like object is read-only and provides the following
           methods: read(), readline(), readlines(), seek() and tell()
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg():
            return self.fileobject(self, tarinfo)

        elif tarinfo.type not in SUPPORTED_TYPES:
            # If a member's type is unknown, it is treated as a
            # regular file.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None

    def _extract_member(self, tarinfo, targetpath, set_attrs=True):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch on the member type; the make*() methods below may be
        # overridden in subclasses.
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath)
            if not tarinfo.issym():
                # mode/mtime of a symlink's target must not be touched.
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)

    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            # Use a safe mode for the directory, the real mode is set
            # later in _extract_member().
            os.mkdir(targetpath, 0o700)
        except EnvironmentError as e:
            # An already existing directory is fine.
            if e.errno != errno.EEXIST:
                raise

    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        target = bltn_open(targetpath, "wb")
        if tarinfo.sparse is not None:
            # Write only the data segments of a sparse member; the holes
            # are produced by seeking past them.
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
        else:
            copyfileobj(source, target, tarinfo.size)
        # Ensure the file has exactly the advertised size (handles a
        # trailing hole in a sparse file).
        target.seek(tarinfo.size)
        target.truncate()
        target.close()

    def makeunknown(self, tarinfo, targetpath):
        """Make a file from a TarInfo object with an unknown type
           at targetpath.
        """
        self.makefile(tarinfo, targetpath)
        self._dbg(1, "tarfile: Unknown file type %r, " \
                     "extracted as regular file." % tarinfo.type)

    def makefifo(self, tarinfo, targetpath):
        """Make a fifo called targetpath.
        """
        if hasattr(os, "mkfifo"):
            os.mkfifo(targetpath)
        else:
            raise ExtractError("fifo not supported by system")

    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.
        """
        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
            raise ExtractError("special devices not supported by system")

        mode = tarinfo.mode
        if tarinfo.isblk():
            mode |= stat.S_IFBLK
        else:
            mode |= stat.S_IFCHR

        os.mknod(targetpath, mode,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))

    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
           (platform limitation), we try to make a copy of the referenced file
           instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract().
2354 if os.path.exists(tarinfo._link_target): 2355 os.link(tarinfo._link_target, targetpath) 2356 else: 2357 self._extract_member(self._find_link_target(tarinfo), 2358 targetpath) 2359 except symlink_exception: 2360 if tarinfo.issym(): 2361 linkpath = os.path.join(os.path.dirname(tarinfo.name), 2362 tarinfo.linkname) 2363 else: 2364 linkpath = tarinfo.linkname 2365 else: 2366 try: 2367 self._extract_member(self._find_link_target(tarinfo), 2368 targetpath) 2369 except KeyError: 2370 raise ExtractError("unable to resolve link inside archive") 2371 2372 def chown(self, tarinfo, targetpath): 2373 """Set owner of targetpath according to tarinfo. 2374 """ 2375 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0: 2376 # We have to be root to do so. 2377 try: 2378 g = grp.getgrnam(tarinfo.gname)[2] 2379 except KeyError: 2380 g = tarinfo.gid 2381 try: 2382 u = pwd.getpwnam(tarinfo.uname)[2] 2383 except KeyError: 2384 u = tarinfo.uid 2385 try: 2386 if tarinfo.issym() and hasattr(os, "lchown"): 2387 os.lchown(targetpath, u, g) 2388 else: 2389 if sys.platform != "os2emx": 2390 os.chown(targetpath, u, g) 2391 except EnvironmentError as e: 2392 raise ExtractError("could not change owner") 2393 2394 def chmod(self, tarinfo, targetpath): 2395 """Set file permissions of targetpath according to tarinfo. 2396 """ 2397 if hasattr(os, 'chmod'): 2398 try: 2399 os.chmod(targetpath, tarinfo.mode) 2400 except EnvironmentError as e: 2401 raise ExtractError("could not change mode") 2402 2403 def utime(self, tarinfo, targetpath): 2404 """Set modification time of targetpath according to tarinfo. 
2405 """ 2406 if not hasattr(os, 'utime'): 2407 return 2408 try: 2409 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) 2410 except EnvironmentError as e: 2411 raise ExtractError("could not change modification time") 2412 2413 #-------------------------------------------------------------------------- 2414 def next(self): 2415 """Return the next member of the archive as a TarInfo object, when 2416 TarFile is opened for reading. Return None if there is no more 2417 available. 2418 """ 2419 self._check("ra") 2420 if self.firstmember is not None: 2421 m = self.firstmember 2422 self.firstmember = None 2423 return m 2424 2425 # Read the next block. 2426 self.fileobj.seek(self.offset) 2427 tarinfo = None 2428 while True: 2429 try: 2430 tarinfo = self.tarinfo.fromtarfile(self) 2431 except EOFHeaderError as e: 2432 if self.ignore_zeros: 2433 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2434 self.offset += BLOCKSIZE 2435 continue 2436 except InvalidHeaderError as e: 2437 if self.ignore_zeros: 2438 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2439 self.offset += BLOCKSIZE 2440 continue 2441 elif self.offset == 0: 2442 raise ReadError(str(e)) 2443 except EmptyHeaderError: 2444 if self.offset == 0: 2445 raise ReadError("empty file") 2446 except TruncatedHeaderError as e: 2447 if self.offset == 0: 2448 raise ReadError(str(e)) 2449 except SubsequentHeaderError as e: 2450 raise ReadError(str(e)) 2451 break 2452 2453 if tarinfo is not None: 2454 self.members.append(tarinfo) 2455 else: 2456 self._loaded = True 2457 2458 return tarinfo 2459 2460 #-------------------------------------------------------------------------- 2461 # Little helper methods: 2462 2463 def _getmember(self, name, tarinfo=None, normalize=False): 2464 """Find an archive member by name from bottom to top. 2465 If tarinfo is given, it is used as the starting point. 2466 """ 2467 # Ensure that all members have been loaded. 2468 members = self.getmembers() 2469 2470 # Limit the member search list up to tarinfo. 
2471 if tarinfo is not None: 2472 members = members[:members.index(tarinfo)] 2473 2474 if normalize: 2475 name = os.path.normpath(name) 2476 2477 for member in reversed(members): 2478 if normalize: 2479 member_name = os.path.normpath(member.name) 2480 else: 2481 member_name = member.name 2482 2483 if name == member_name: 2484 return member 2485 2486 def _load(self): 2487 """Read through the entire archive file and look for readable 2488 members. 2489 """ 2490 while True: 2491 tarinfo = self.next() 2492 if tarinfo is None: 2493 break 2494 self._loaded = True 2495 2496 def _check(self, mode=None): 2497 """Check if TarFile is still open, and if the operation's mode 2498 corresponds to TarFile's mode. 2499 """ 2500 if self.closed: 2501 raise IOError("%s is closed" % self.__class__.__name__) 2502 if mode is not None and self.mode not in mode: 2503 raise IOError("bad operation for mode %r" % self.mode) 2504 2505 def _find_link_target(self, tarinfo): 2506 """Find the target member of a symlink or hardlink member in the 2507 archive. 2508 """ 2509 if tarinfo.issym(): 2510 # Always search the entire archive. 2511 linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname 2512 limit = None 2513 else: 2514 # Search the archive before the link, because a hard link is 2515 # just a reference to an already archived file. 2516 linkname = tarinfo.linkname 2517 limit = tarinfo 2518 2519 member = self._getmember(linkname, tarinfo=limit, normalize=True) 2520 if member is None: 2521 raise KeyError("linkname %r not found" % linkname) 2522 return member 2523 2524 def __iter__(self): 2525 """Provide an iterator object. 2526 """ 2527 if self._loaded: 2528 return iter(self.members) 2529 else: 2530 return TarIter(self) 2531 2532 def _dbg(self, level, msg): 2533 """Write debugging output to sys.stderr. 
2534 """ 2535 if level <= self.debug: 2536 print(msg, file=sys.stderr) 2537 2538 def __enter__(self): 2539 self._check() 2540 return self 2541 2542 def __exit__(self, type, value, traceback): 2543 if type is None: 2544 self.close() 2545 else: 2546 # An exception occurred. We must not call close() because 2547 # it would try to write end-of-archive blocks and padding. 2548 if not self._extfileobj: 2549 self.fileobj.close() 2550 self.closed = True 2551# class TarFile 2552 2553class TarIter(object): 2554 """Iterator Class. 2555 2556 for tarinfo in TarFile(...): 2557 suite... 2558 """ 2559 2560 def __init__(self, tarfile): 2561 """Construct a TarIter object. 2562 """ 2563 self.tarfile = tarfile 2564 self.index = 0 2565 def __iter__(self): 2566 """Return iterator object. 2567 """ 2568 return self 2569 2570 def __next__(self): 2571 """Return the next item using TarFile's next() method. 2572 When all members have been read, set TarFile as _loaded. 2573 """ 2574 # Fix for SF #1100429: Under rare circumstances it can 2575 # happen that getmembers() is called during iteration, 2576 # which will cause TarIter to stop prematurely. 2577 if not self.tarfile._loaded: 2578 tarinfo = self.tarfile.next() 2579 if not tarinfo: 2580 self.tarfile._loaded = True 2581 raise StopIteration 2582 else: 2583 try: 2584 tarinfo = self.tarfile.members[self.index] 2585 except IndexError: 2586 raise StopIteration 2587 self.index += 1 2588 return tarinfo 2589 2590 next = __next__ # for Python 2.x 2591 2592#-------------------- 2593# exported functions 2594#-------------------- 2595def is_tarfile(name): 2596 """Return True if name points to a tar archive that we 2597 are able to handle, else return False. 2598 """ 2599 try: 2600 t = open(name) 2601 t.close() 2602 return True 2603 except TarError: 2604 return False 2605 2606bltn_open = open 2607open = TarFile.open 2608