1#-------------------------------------------------------------------
2# tarfile.py
3#-------------------------------------------------------------------
4# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
5# All rights reserved.
6#
7# Permission  is  hereby granted,  free  of charge,  to  any person
8# obtaining a  copy of  this software  and associated documentation
9# files  (the  "Software"),  to   deal  in  the  Software   without
10# restriction,  including  without limitation  the  rights to  use,
11# copy, modify, merge, publish, distribute, sublicense, and/or sell
12# copies  of  the  Software,  and to  permit  persons  to  whom the
13# Software  is  furnished  to  do  so,  subject  to  the  following
14# conditions:
15#
16# The above copyright  notice and this  permission notice shall  be
17# included in all copies or substantial portions of the Software.
18#
19# THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
20# EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
21# OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
22# NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
23# HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
24# WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
25# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26# OTHER DEALINGS IN THE SOFTWARE.
27#
28from __future__ import print_function
29
30"""Read from and write to tar format archives.
31"""
32
33__version__ = "$Revision$"
34
35version     = "0.9.0"
36__author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
37__date__    = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
38__cvsid__   = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
39__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
40
41#---------
42# Imports
43#---------
44import sys
45import os
46import stat
47import errno
48import time
49import struct
50import copy
51import re
52
53try:
54    import grp, pwd
55except ImportError:
56    grp = pwd = None
57
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (WindowsError,)
except NameError:
    # WindowsError is not defined on this platform/interpreter.
    pass

# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

if sys.version_info[0] < 3:
    import __builtin__ as builtins
else:
    import builtins

_open = builtins.open   # Since 'open' is TarFile.open
76
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string (magic "ustar\0" + version "00")

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file (old V7 style)
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT     # format used when none is given explicitly
112
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = set(("path", "linkpath", "uname", "gname"))

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  The value is the conversion function
# applied to the decoded header value.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
148
#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

TSUID   = 0o4000          # set UID on execution
TSGID   = 0o2000          # set GID on execution
TSVTX   = 0o1000          # reserved (sticky bit)

TUREAD  = 0o400           # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC  = 0o100           # execute/search by owner
TGREAD  = 0o040           # read by group
TGWRITE = 0o020           # write by group
TGEXEC  = 0o010           # execute/search by group
TOREAD  = 0o004           # read by other
TOWRITE = 0o002           # write by other
TOEXEC  = 0o001           # execute/search by other

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# NOTE(review): UTF-8 is forced on Windows ("nt"/"ce"), presumably because
# the default codepage cannot represent arbitrary member names — confirm.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
180
181#---------------------------------------------------------
182# Some useful functions
183#---------------------------------------------------------
184
def stn(s, length, encoding, errors):
    """Encode string *s* and fit it into a field of exactly *length*
       bytes, truncating or NUL-padding as necessary.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, b"\0")
190
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
    """
    # Everything from the first NUL byte onwards is padding.
    return s.split(b"\0", 1)[0].decode(encoding, errors)
198
def nti(s):
    """Convert a number field to a python number.
    """
    # There are two possible encodings for a number field, see
    # itn() below.  Compare a one-byte *slice* against b"\x80" instead
    # of indexing: on Python 3, s[0] is an int and the original
    # comparison against chr(0o200) (a str) was always unequal, so the
    # GNU base-256 branch was unreachable there.
    if s[0:1] != b"\x80":
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    else:
        # GNU base-256 encoding: the remaining bytes are a big-endian
        # number.  Iterate over a bytearray so each element is an int
        # on both Python 2 and 3 (ord() would fail on Py3 ints).
        n = 0
        for b in bytearray(s[1:]):
            n <<= 8
            n += b
    return n
215
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 byte indicates this particular
    # encoding, the following digits-1 bytes are a big-endian
    # representation. This allows values up to (256**(digits-1))-1.
    if 0 <= n < 8 ** (digits - 1):
        s = ("%0*o" % (digits - 1, n)).encode("ascii") + NUL
    else:
        if format != GNU_FORMAT or n >= 256 ** (digits - 1):
            raise ValueError("overflow in number field")

        if n < 0:
            # XXX We mimic GNU tar's behaviour with negative numbers,
            # this could raise OverflowError.
            n = struct.unpack("L", struct.pack("l", n))[0]

        s = bytearray()
        for i in range(digits - 1):
            s.insert(0, n & 0o377)
            n >>= 8
        s.insert(0, 0o200)
        # Return bytes in both branches (the octal branch already does);
        # previously this branch leaked a mutable bytearray to callers.
        s = bytes(s)
    return s
242
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # The chksum field occupies bytes 148..155 and counts as eight
    # spaces: 8 * 0x20 == 256, hence the constant added to each sum.
    head, tail = buf[:148], buf[156:512]
    unsigned_chksum = 256 + sum(struct.unpack("148B", head)) \
                          + sum(struct.unpack("356B", tail))
    signed_chksum = 256 + sum(struct.unpack("148b", head)) \
                        + sum(struct.unpack("356b", tail))
    return unsigned_chksum, signed_chksum
255
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises IOError if src runs out of data before length bytes
       could be copied.
    """
    # One shared chunk size; previously the unbounded path used a bare
    # 16*1024 literal while the bounded path defined BUFSIZE separately.
    BUFSIZE = 16 * 1024

    if length == 0:
        return
    if length is None:
        # Unbounded copy: stream until EOF.
        while True:
            buf = src.read(BUFSIZE)
            if not buf:
                break
            dst.write(buf)
        return

    # Bounded copy: full-size blocks first, then the remainder.
    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        if len(buf) < BUFSIZE:
            raise IOError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise IOError("end of file reached")
        dst.write(buf)
    return
284
# Lookup table for filemode(): one inner tuple per output column.
# Within a column, the first (bit, char) pair whose bits are all set
# in the mode wins; if none match, the column shows "-".
filemode_table = (
    ((S_IFLNK,      "l"),
     (S_IFREG,      "-"),
     (S_IFBLK,      "b"),
     (S_IFDIR,      "d"),
     (S_IFCHR,      "c"),
     (S_IFIFO,      "p")),

    ((TUREAD,       "r"),),
    ((TUWRITE,      "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID,        "S"),
     (TUEXEC,       "x")),

    ((TGREAD,       "r"),),
    ((TGWRITE,      "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID,        "S"),
     (TGEXEC,       "x")),

    ((TOREAD,       "r"),),
    ((TOWRITE,      "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX,        "T"),
     (TOEXEC,       "x"))
)
311
def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    chars = []
    for column in filemode_table:
        symbol = "-"
        for bit, char in column:
            if mode & bit == bit:
                symbol = char
                break
        chars.append(symbol)
    return "".join(chars)
326
class TarError(Exception):
    """Base exception."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Exception for unreadable tar archives."""

class CompressionError(TarError):
    """Exception for unavailable compression methods."""

class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for header errors."""

class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""

class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""

class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""

class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""

class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
360
361#---------------------------
362# internal stream interface
363#---------------------------
364class _LowLevelFile(object):
365    """Low-level file object. Supports reading and writing.
366       It is used instead of a regular file object for streaming
367       access.
368    """
369
370    def __init__(self, name, mode):
371        mode = {
372            "r": os.O_RDONLY,
373            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
374        }[mode]
375        if hasattr(os, "O_BINARY"):
376            mode |= os.O_BINARY
377        self.fd = os.open(name, mode, 0o666)
378
379    def close(self):
380        os.close(self.fd)
381
382    def read(self, size):
383        return os.read(self.fd, size)
384
385    def write(self, s):
386        os.write(self.fd, s)
387
388class _Stream(object):
389    """Class that serves as an adapter between TarFile and
390       a stream-like object.  The stream-like object only
391       needs to have a read() or write() method and is accessed
392       blockwise.  Use of gzip or bzip2 compression is possible.
393       A stream-like object could be for example: sys.stdin,
394       sys.stdout, a socket, a tape device etc.
395
396       _Stream is intended to be used only internally.
397    """
398
399    def __init__(self, name, mode, comptype, fileobj, bufsize):
400        """Construct a _Stream object.
401        """
402        self._extfileobj = True
403        if fileobj is None:
404            fileobj = _LowLevelFile(name, mode)
405            self._extfileobj = False
406
407        if comptype == '*':
408            # Enable transparent compression detection for the
409            # stream interface
410            fileobj = _StreamProxy(fileobj)
411            comptype = fileobj.getcomptype()
412
413        self.name     = name or ""
414        self.mode     = mode
415        self.comptype = comptype
416        self.fileobj  = fileobj
417        self.bufsize  = bufsize
418        self.buf      = b""
419        self.pos      = 0
420        self.closed   = False
421
422        try:
423            if comptype == "gz":
424                try:
425                    import zlib
426                except ImportError:
427                    raise CompressionError("zlib module is not available")
428                self.zlib = zlib
429                self.crc = zlib.crc32(b"")
430                if mode == "r":
431                    self._init_read_gz()
432                else:
433                    self._init_write_gz()
434
435            if comptype == "bz2":
436                try:
437                    import bz2
438                except ImportError:
439                    raise CompressionError("bz2 module is not available")
440                if mode == "r":
441                    self.dbuf = b""
442                    self.cmp = bz2.BZ2Decompressor()
443                else:
444                    self.cmp = bz2.BZ2Compressor()
445        except:
446            if not self._extfileobj:
447                self.fileobj.close()
448            self.closed = True
449            raise
450
451    def __del__(self):
452        if hasattr(self, "closed") and not self.closed:
453            self.close()
454
455    def _init_write_gz(self):
456        """Initialize for writing with gzip compression.
457        """
458        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
459                                            -self.zlib.MAX_WBITS,
460                                            self.zlib.DEF_MEM_LEVEL,
461                                            0)
462        timestamp = struct.pack("<L", int(time.time()))
463        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
464        if self.name.endswith(".gz"):
465            self.name = self.name[:-3]
466        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
467        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
468
469    def write(self, s):
470        """Write string s to the stream.
471        """
472        if self.comptype == "gz":
473            self.crc = self.zlib.crc32(s, self.crc)
474        self.pos += len(s)
475        if self.comptype != "tar":
476            s = self.cmp.compress(s)
477        self.__write(s)
478
479    def __write(self, s):
480        """Write string s to the stream if a whole new block
481           is ready to be written.
482        """
483        self.buf += s
484        while len(self.buf) > self.bufsize:
485            self.fileobj.write(self.buf[:self.bufsize])
486            self.buf = self.buf[self.bufsize:]
487
488    def close(self):
489        """Close the _Stream object. No operation should be
490           done on it afterwards.
491        """
492        if self.closed:
493            return
494
495        if self.mode == "w" and self.comptype != "tar":
496            self.buf += self.cmp.flush()
497
498        if self.mode == "w" and self.buf:
499            self.fileobj.write(self.buf)
500            self.buf = b""
501            if self.comptype == "gz":
502                # The native zlib crc is an unsigned 32-bit integer, but
503                # the Python wrapper implicitly casts that to a signed C
504                # long.  So, on a 32-bit box self.crc may "look negative",
505                # while the same crc on a 64-bit box may "look positive".
506                # To avoid irksome warnings from the `struct` module, force
507                # it to look positive on all boxes.
508                self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
509                self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
510
511        if not self._extfileobj:
512            self.fileobj.close()
513
514        self.closed = True
515
516    def _init_read_gz(self):
517        """Initialize for reading a gzip compressed fileobj.
518        """
519        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
520        self.dbuf = b""
521
522        # taken from gzip.GzipFile with some alterations
523        if self.__read(2) != b"\037\213":
524            raise ReadError("not a gzip file")
525        if self.__read(1) != b"\010":
526            raise CompressionError("unsupported compression method")
527
528        flag = ord(self.__read(1))
529        self.__read(6)
530
531        if flag & 4:
532            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
533            self.read(xlen)
534        if flag & 8:
535            while True:
536                s = self.__read(1)
537                if not s or s == NUL:
538                    break
539        if flag & 16:
540            while True:
541                s = self.__read(1)
542                if not s or s == NUL:
543                    break
544        if flag & 2:
545            self.__read(2)
546
547    def tell(self):
548        """Return the stream's file pointer position.
549        """
550        return self.pos
551
552    def seek(self, pos=0):
553        """Set the stream's file pointer to pos. Negative seeking
554           is forbidden.
555        """
556        if pos - self.pos >= 0:
557            blocks, remainder = divmod(pos - self.pos, self.bufsize)
558            for i in range(blocks):
559                self.read(self.bufsize)
560            self.read(remainder)
561        else:
562            raise StreamError("seeking backwards is not allowed")
563        return self.pos
564
565    def read(self, size=None):
566        """Return the next size number of bytes from the stream.
567           If size is not defined, return all bytes of the stream
568           up to EOF.
569        """
570        if size is None:
571            t = []
572            while True:
573                buf = self._read(self.bufsize)
574                if not buf:
575                    break
576                t.append(buf)
577            buf = "".join(t)
578        else:
579            buf = self._read(size)
580        self.pos += len(buf)
581        return buf
582
583    def _read(self, size):
584        """Return size bytes from the stream.
585        """
586        if self.comptype == "tar":
587            return self.__read(size)
588
589        c = len(self.dbuf)
590        while c < size:
591            buf = self.__read(self.bufsize)
592            if not buf:
593                break
594            try:
595                buf = self.cmp.decompress(buf)
596            except IOError:
597                raise ReadError("invalid compressed data")
598            self.dbuf += buf
599            c += len(buf)
600        buf = self.dbuf[:size]
601        self.dbuf = self.dbuf[size:]
602        return buf
603
604    def __read(self, size):
605        """Return size bytes from stream. If internal buffer is empty,
606           read another block from the stream.
607        """
608        c = len(self.buf)
609        while c < size:
610            buf = self.fileobj.read(self.bufsize)
611            if not buf:
612                break
613            self.buf += buf
614            c += len(buf)
615        buf = self.buf[:size]
616        self.buf = self.buf[size:]
617        return buf
618# class _Stream
619
620class _StreamProxy(object):
621    """Small proxy class that enables transparent compression
622       detection for the Stream interface (mode 'r|*').
623    """
624
625    def __init__(self, fileobj):
626        self.fileobj = fileobj
627        self.buf = self.fileobj.read(BLOCKSIZE)
628
629    def read(self, size):
630        self.read = self.fileobj.read
631        return self.buf
632
633    def getcomptype(self):
634        if self.buf.startswith(b"\037\213\010"):
635            return "gz"
636        if self.buf.startswith(b"BZh91"):
637            return "bz2"
638        return "tar"
639
640    def close(self):
641        self.fileobj.close()
642# class StreamProxy
643
644class _BZ2Proxy(object):
645    """Small proxy class that enables external file object
646       support for "r:bz2" and "w:bz2" modes. This is actually
647       a workaround for a limitation in bz2 module's BZ2File
648       class which (unlike gzip.GzipFile) has no support for
649       a file object argument.
650    """
651
652    blocksize = 16 * 1024
653
654    def __init__(self, fileobj, mode):
655        self.fileobj = fileobj
656        self.mode = mode
657        self.name = getattr(self.fileobj, "name", None)
658        self.init()
659
660    def init(self):
661        import bz2
662        self.pos = 0
663        if self.mode == "r":
664            self.bz2obj = bz2.BZ2Decompressor()
665            self.fileobj.seek(0)
666            self.buf = b""
667        else:
668            self.bz2obj = bz2.BZ2Compressor()
669
670    def read(self, size):
671        x = len(self.buf)
672        while x < size:
673            raw = self.fileobj.read(self.blocksize)
674            if not raw:
675                break
676            data = self.bz2obj.decompress(raw)
677            self.buf += data
678            x += len(data)
679
680        buf = self.buf[:size]
681        self.buf = self.buf[size:]
682        self.pos += len(buf)
683        return buf
684
685    def seek(self, pos):
686        if pos < self.pos:
687            self.init()
688        self.read(pos - self.pos)
689
690    def tell(self):
691        return self.pos
692
693    def write(self, data):
694        self.pos += len(data)
695        raw = self.bz2obj.compress(data)
696        self.fileobj.write(raw)
697
698    def close(self):
699        if self.mode == "w":
700            raw = self.bz2obj.flush()
701            self.fileobj.write(raw)
702# class _BZ2Proxy
703
704#------------------------
705# Extraction file object
706#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """fileobj:   underlying (seekable) file object.
           offset:    absolute position of the member's data in fileobj.
           size:      logical size of the member's data in bytes.
           blockinfo: list of (offset, size) pairs describing the data
                      blocks of a sparse member; None means the data is
                      one contiguous block.
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0   # current logical read position

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each entry is (is_data, start, stop, realpos): start/stop are
        # logical positions, realpos is the physical offset in fileobj
        # (None for holes, which read back as NUL bytes).
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                # Gap before this data block -> zero-filled hole.
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            # Trailing hole up to the logical end of the member.
            self.map.append((False, lastpos, self.size, None))

    def seekable(self):
        if not hasattr(self.fileobj, "seekable"):
            # XXX gzip.GzipFile and bz2.BZ2File
            return True
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        # NOTE(review): position is not clamped to [0, size] here;
        # presumably callers pass validated values — confirm.
        self.position = position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry covering the current position.  The
            # index wraps around because seek() may have moved backwards.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                # Data block: read from the real offset in fileobj.
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                # Hole: sparse regions read back as NUL bytes.
                buf += NUL * length
            size -= length
            self.position += length
        return buf
#class _FileInFile
780
781
class ExFileObject(object):
    """File-like object for reading an archive member.
       Is returned by TarFile.extractfile().
    """
    blocksize = 1024    # chunk size used by readline()'s read-ahead

    def __init__(self, tarfile, tarinfo):
        # Expose the member's data through a _FileInFile window on the
        # archive's underlying file object.
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   tarinfo.sparse)
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0   # logical position, including buffered data
        self.buffer = b""   # read-ahead buffer filled by readline()

    def readable(self):
        """Return True: archive members are always readable."""
        return True

    def writable(self):
        """Return False: this file object is read-only."""
        return False

    def seekable(self):
        """Return whether the underlying file object supports seeking."""
        return self.fileobj.seekable()

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present or None, read all data until EOF is reached.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        buf = b""
        if self.buffer:
            # Serve data already read ahead by readline() first.
            if size is None:
                buf = self.buffer
                self.buffer = b""
            else:
                buf = self.buffer[:size]
                self.buffer = self.buffer[size:]

        if size is None:
            buf += self.fileobj.read()
        else:
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    # XXX TextIOWrapper uses the read1() method.
    read1 = read

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a string with at most that
           size, which may be an incomplete line.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        pos = self.buffer.find(b"\n") + 1
        if pos == 0:
            # no newline found.
            # Read ahead in blocksize chunks until a newline or EOF.
            while True:
                buf = self.fileobj.read(self.blocksize)
                self.buffer += buf
                if not buf or b"\n" in buf:
                    pos = self.buffer.find(b"\n") + 1
                    if pos == 0:
                        # no newline found.
                        pos = len(self.buffer)
                    break

        if size != -1:
            # Truncate to the requested maximum length.
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line: break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=os.SEEK_SET):
        """Seek to a position in the file.

           The resulting position is clamped to [0, size].
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == os.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == os.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == os.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        # Any read-ahead buffered for readline() is invalid after a seek.
        self.buffer = b""
        self.fileobj.seek(self.position)

    def close(self):
        """Close the file object.
        """
        # Only marks this object closed; the archive's file object is
        # managed by the owning TarFile and stays open.
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
#class ExFileObject
919
920#------------------
921# Exported Classes
922#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # __slots__ keeps the per-instance footprint small; an archive
    # may contain a very large number of members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

    # In pax headers the "name" and "linkname" fields are called
    # "path" and "linkpath"; expose them under both spellings.
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))

    def get_info(self):
        """Return the TarInfo's attributes as a dictionary.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor
        }

        # A directory is stored in the archive with a trailing slash.
        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.
        """
        info = self.get_info()

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        # ustar has no way to store an overlong linkname.
        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        # An overlong name may be split into a prefix/name pair.
        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        # Overlong names/linknames are emitted as extra pseudo-member
        # blocks preceding the real header.
        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # digits - 1 because one byte of the octal field is the
            # terminating NUL.
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8")

    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part.
        """
        # Cut at the rightmost "/" that keeps the prefix within
        # LENGTH_PREFIX characters.
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        prefix = prefix[:-1]

        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")
        return prefix, name

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        # The fields below follow the fixed ustar header layout.
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Splice the checksum into the chksum field: for a 512-byte
        # block, -364 == 148 - 512 and -357 == 155 - 512, i.e. the
        # 7 bytes at offset 148; the placeholder's 8th byte (a space)
        # is preserved.
        buf = buf[:-364] + ("%06o\0" % chksum).encode("ascii") + buf[-357:]
        return buf

    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            # The pax length field counts itself, so iterate until the
            # record size reaches a fixed point.
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)

    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Unpack the fields at their fixed ustar offsets.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following
    # operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        # Each extension block holds up to 21 (offset, numbytes)
        # pairs of 12 octal digits each.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded, but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf8", "utf8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf8", "utf8",
                        tarfile.errors)

            pax_headers[keyword] = value
            # Advance by the record's self-declared total size.
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.
        """
        # The sparse map is stored as repeated offset/numbytes records
        # directly in the pax header data.
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.
        """
        # The map is a single comma-separated list of alternating
        # offset and numbytes values.
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.
        """
        # The map precedes the file data as newline-separated decimal
        # numbers: a count followed by alternating offset/numbytes.
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    # Convenience predicates for the member's type.
    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1578# class TarInfo
1579
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    # The attributes below are class-level defaults; __init__ overrides
    # them per instance when the corresponding keyword argument is given.

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The default ExFileObject class to use.
1605
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        # NOTE(review): an empty mode string slips through this check
        # ("" is a substring of "raw") and fails on the dict lookup
        # below with KeyError instead of ValueError.
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        # Map the archive mode to the corresponding binary file mode.
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            # A caller-supplied file object: borrow its name/mode if
            # available, and never close it on our behalf.
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # pax_headers only make sense when writing PAX format.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                # Emit a pax global header right away when requested.
                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Bare except is intentional: close a file we opened
            # ourselves, mark the archive closed, and re-raise.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
1701
1702    #--------------------------------------------------------------------------
1703    # Below are the classmethods which act as alternate constructors to the
1704    # TarFile class. The open() method is the only one that is needed for
1705    # public use; it is the "super"-constructor and is able to select an
1706    # adequate "sub"-constructor for a particular compression using the mapping
1707    # from OPEN_METH.
1708    #
1709    # This concept allows one to subclass TarFile without losing the comfort of
1710    # the super-constructor. A sub-constructor is registered and made available
1711    # by adding it to the mapping in OPEN_METH.
1712
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Transparent compression: try every registered *open() method
            # in turn until one accepts the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    # Remember the position so a failed attempt can rewind
                    # before the next *open() method is tried.
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            # Explicit compression, e.g. "r:gz".  An empty half defaults to
            # reading ("r") resp. no compression ("tar").
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            # Stream mode, e.g. "r|gz": non-seekable input/output processed
            # blockwise through a _Stream object.
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            # The TarFile owns the stream it created and must close it.
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")
1788
1789    @classmethod
1790    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1791        """Open uncompressed tar archive name for reading or writing.
1792        """
1793        if len(mode) > 1 or mode not in "raw":
1794            raise ValueError("mode must be 'r', 'a' or 'w'")
1795        return cls(name, mode, fileobj, **kwargs)
1796
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller supplied the file object; only
        # internally created ones may be closed on error (and later by
        # close(), via t._extfileobj).
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except IOError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                # GzipFile() itself failed before rebinding fileobj;
                # propagate the original error.
                raise
            # taropen() failed: the data is not a readable tar archive.
            raise ReadError("not a gzip file")
        except:
            # Any other error: release the wrapper we created, re-raise.
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
1827
1828    @classmethod
1829    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1830        """Open bzip2 compressed tar archive name for reading or writing.
1831           Appending is not allowed.
1832        """
1833        if len(mode) > 1 or mode not in "rw":
1834            raise ValueError("mode must be 'r' or 'w'.")
1835
1836        try:
1837            import bz2
1838        except ImportError:
1839            raise CompressionError("bz2 module is not available")
1840
1841        if fileobj is not None:
1842            fileobj = _BZ2Proxy(fileobj, mode)
1843        else:
1844            fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)
1845
1846        try:
1847            t = cls.taropen(name, mode, fileobj, **kwargs)
1848        except (IOError, EOFError):
1849            fileobj.close()
1850            raise ReadError("not a bzip2 file")
1851        t._extfileobj = False
1852        return t
1853
    # All *open() methods are registered here.  open() consults this mapping
    # to dispatch a mode suffix (e.g. "r:gz") to the matching constructor;
    # subclasses may extend it to register additional compression methods.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open"    # bzip2 compressed tar
    }
1860
1861    #--------------------------------------------------------------------------
1862    # The public methods which TarFile provides:
1863
1864    def close(self):
1865        """Close the TarFile. In write-mode, two finishing zero blocks are
1866           appended to the archive.
1867        """
1868        if self.closed:
1869            return
1870
1871        if self.mode in "aw":
1872            self.fileobj.write(NUL * (BLOCKSIZE * 2))
1873            self.offset += (BLOCKSIZE * 2)
1874            # fill up the end with zero-blocks
1875            # (like option -b20 for tar does)
1876            blocks, remainder = divmod(self.offset, RECORDSIZE)
1877            if remainder > 0:
1878                self.fileobj.write(NUL * (RECORDSIZE - remainder))
1879
1880        if not self._extfileobj:
1881            self.fileobj.close()
1882        self.closed = True
1883
1884    def getmember(self, name):
1885        """Return a TarInfo object for member `name'. If `name' can not be
1886           found in the archive, KeyError is raised. If a member occurs more
1887           than once in the archive, its last occurrence is assumed to be the
1888           most up-to-date version.
1889        """
1890        tarinfo = self._getmember(name)
1891        if tarinfo is None:
1892            raise KeyError("filename %r not found" % name)
1893        return tarinfo
1894
1895    def getmembers(self):
1896        """Return the members of the archive as a list of TarInfo objects. The
1897           list has the same order as the members in the archive.
1898        """
1899        self._check()
1900        if not self._loaded:    # if we want to obtain a list of
1901            self._load()        # all members, we first have to
1902                                # scan the whole archive.
1903        return self.members
1904
1905    def getnames(self):
1906        """Return the members of the archive as a list of their names. It has
1907           the same order as the list returned by getmembers().
1908        """
1909        return [tarinfo.name for tarinfo in self.getmembers()]
1910
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
           object `fileobj' (using os.fstat on its file descriptor). You can
           modify some of the TarInfo's attributes before you add it using
           addfile(). If given, `arcname' specifies an alternative name for the
           file in the archive.

           Returns None if the file's type cannot be represented in a tar
           archive (the final else branch below, e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file? Store it as a link to that member.
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Anything else (e.g. a socket) has no tar representation.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            # Only regular files carry payload data.
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Map numeric ids to symbolic names where possible (pwd/grp are
        # presumably falsy on platforms without them -- see module imports).
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            # Record device numbers when the platform can decompose st_rdev.
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2008
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                # Permission bits, then "owner/group" (numeric ids when no
                # symbolic names are stored in the archive).
                print(filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                # Device nodes show major,minor instead of a size.
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            # Directories get a trailing slash, like `ls -p'.
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                # Show the target for symbolic and hard links.
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
2037
2038    def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
2039        """Add the file `name' to the archive. `name' may be any type of file
2040           (directory, fifo, symbolic link, etc.). If given, `arcname'
2041           specifies an alternative name for the file in the archive.
2042           Directories are added recursively by default. This can be avoided by
2043           setting `recursive' to False. `exclude' is a function that should
2044           return True for each filename to be excluded. `filter' is a function
2045           that expects a TarInfo object argument and returns the changed
2046           TarInfo object, if it returns None the TarInfo object will be
2047           excluded from the archive.
2048        """
2049        self._check("aw")
2050
2051        if arcname is None:
2052            arcname = name
2053
2054        # Exclude pathnames.
2055        if exclude is not None:
2056            import warnings
2057            warnings.warn("use the filter argument instead",
2058                    DeprecationWarning, 2)
2059            if exclude(name):
2060                self._dbg(2, "tarfile: Excluded %r" % name)
2061                return
2062
2063        # Skip if somebody tries to archive the archive...
2064        if self.name is not None and os.path.abspath(name) == self.name:
2065            self._dbg(2, "tarfile: Skipped %r" % name)
2066            return
2067
2068        self._dbg(1, name)
2069
2070        # Create a TarInfo object from the file.
2071        tarinfo = self.gettarinfo(name, arcname)
2072
2073        if tarinfo is None:
2074            self._dbg(1, "tarfile: Unsupported type %r" % name)
2075            return
2076
2077        # Change or exclude the TarInfo object.
2078        if filter is not None:
2079            tarinfo = filter(tarinfo)
2080            if tarinfo is None:
2081                self._dbg(2, "tarfile: Excluded %r" % name)
2082                return
2083
2084        # Append the tar header and data to the archive.
2085        if tarinfo.isreg():
2086            f = bltn_open(name, "rb")
2087            self.addfile(tarinfo, f)
2088            f.close()
2089
2090        elif tarinfo.isdir():
2091            self.addfile(tarinfo)
2092            if recursive:
2093                for f in os.listdir(name):
2094                    self.add(os.path.join(name, f), os.path.join(arcname, f),
2095                            recursive, exclude, filter=filter)
2096
2097        else:
2098            self.addfile(tarinfo)
2099
2100    def addfile(self, tarinfo, fileobj=None):
2101        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2102           given, tarinfo.size bytes are read from it and added to the archive.
2103           You can create TarInfo objects using gettarinfo().
2104           On Windows platforms, `fileobj' should always be opened with mode
2105           'rb' to avoid irritation about the file size.
2106        """
2107        self._check("aw")
2108
2109        tarinfo = copy.copy(tarinfo)
2110
2111        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2112        self.fileobj.write(buf)
2113        self.offset += len(buf)
2114
2115        # If there's data to follow, append it.
2116        if fileobj is not None:
2117            copyfileobj(fileobj, self.fileobj, tarinfo.size)
2118            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2119            if remainder > 0:
2120                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2121                blocks += 1
2122            self.offset += blocks * BLOCKSIZE
2123
2124        self.members.append(tarinfo)
2125
2126    def extractall(self, path=".", members=None):
2127        """Extract all members from the archive to the current working
2128           directory and set owner, modification time and permissions on
2129           directories afterwards. `path' specifies a different directory
2130           to extract to. `members' is optional and must be a subset of the
2131           list returned by getmembers().
2132        """
2133        directories = []
2134
2135        if members is None:
2136            members = self
2137
2138        for tarinfo in members:
2139            if tarinfo.isdir():
2140                # Extract directories with a safe mode.
2141                directories.append(tarinfo)
2142                tarinfo = copy.copy(tarinfo)
2143                tarinfo.mode = 0o700
2144            # Do not set_attrs directories, as we will do that further down
2145            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
2146
2147        # Reverse sort directories.
2148        directories.sort(key=lambda a: a.name)
2149        directories.reverse()
2150
2151        # Set correct owner, mtime and filemode on directories.
2152        for tarinfo in directories:
2153            dirpath = os.path.join(path, tarinfo.name)
2154            try:
2155                self.chown(tarinfo, dirpath)
2156                self.utime(tarinfo, dirpath)
2157                self.chmod(tarinfo, dirpath)
2158            except ExtractError as e:
2159                if self.errorlevel > 1:
2160                    raise
2161                else:
2162                    self._dbg(1, "tarfile: %s" % e)
2163
    def extract(self, member, path="", set_attrs=True):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False.

           Error policy follows self.errorlevel: OS errors propagate at
           errorlevel > 0, ExtractErrors only at errorlevel > 1; below those
           thresholds they are merely reported via the debug log.
        """
        self._check("r")

        # Accept either a member name or a ready TarInfo object.
        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs)
        except EnvironmentError as e:
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
2198
2199    def extractfile(self, member):
2200        """Extract a member from the archive as a file object. `member' may be
2201           a filename or a TarInfo object. If `member' is a regular file, a
2202           file-like object is returned. If `member' is a link, a file-like
2203           object is constructed from the link's target. If `member' is none of
2204           the above, None is returned.
2205           The file-like object is read-only and provides the following
2206           methods: read(), readline(), readlines(), seek() and tell()
2207        """
2208        self._check("r")
2209
2210        if isinstance(member, str):
2211            tarinfo = self.getmember(member)
2212        else:
2213            tarinfo = member
2214
2215        if tarinfo.isreg():
2216            return self.fileobject(self, tarinfo)
2217
2218        elif tarinfo.type not in SUPPORTED_TYPES:
2219            # If a member's type is unknown, it is treated as a
2220            # regular file.
2221            return self.fileobject(self, tarinfo)
2222
2223        elif tarinfo.islnk() or tarinfo.issym():
2224            if isinstance(self.fileobj, _Stream):
2225                # A small but ugly workaround for the case that someone tries
2226                # to extract a (sym)link as a file-object from a non-seekable
2227                # stream of tar blocks.
2228                raise StreamError("cannot extract (sym)link as file object")
2229            else:
2230                # A (sym)link's file object is its target's file object.
2231                return self.extractfile(self._find_link_target(tarinfo))
2232        else:
2233            # If there's no data associated with the member (directory, chrdev,
2234            # blkdev, etc.), return None instead of a file object.
2235            return None
2236
    def _extract_member(self, tarinfo, targetpath, set_attrs=True):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        # NOTE(review): targetpath is derived from the member name upstream;
        # nothing here guards against member names containing ".." -- only
        # extract archives from trusted sources.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch on the member type; unsupported types fall back to
        # extraction as a regular file via makeunknown().
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath)
            if not tarinfo.issym():
                # Mode and mtime are not applied to symlinks themselves.
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)
2279
2280    #--------------------------------------------------------------------------
2281    # Below are the different file methods. They are called via
2282    # _extract_member() when extract() is called. They can be replaced in a
2283    # subclass to implement other functionality.
2284
2285    def makedir(self, tarinfo, targetpath):
2286        """Make a directory called targetpath.
2287        """
2288        try:
2289            # Use a safe mode for the directory, the real mode is set
2290            # later in _extract_member().
2291            os.mkdir(targetpath, 0o700)
2292        except EnvironmentError as e:
2293            if e.errno != errno.EEXIST:
2294                raise
2295
2296    def makefile(self, tarinfo, targetpath):
2297        """Make a file called targetpath.
2298        """
2299        source = self.fileobj
2300        source.seek(tarinfo.offset_data)
2301        target = bltn_open(targetpath, "wb")
2302        if tarinfo.sparse is not None:
2303            for offset, size in tarinfo.sparse:
2304                target.seek(offset)
2305                copyfileobj(source, target, size)
2306        else:
2307            copyfileobj(source, target, tarinfo.size)
2308        target.seek(tarinfo.size)
2309        target.truncate()
2310        target.close()
2311
2312    def makeunknown(self, tarinfo, targetpath):
2313        """Make a file from a TarInfo object with an unknown type
2314           at targetpath.
2315        """
2316        self.makefile(tarinfo, targetpath)
2317        self._dbg(1, "tarfile: Unknown file type %r, " \
2318                     "extracted as regular file." % tarinfo.type)
2319
2320    def makefifo(self, tarinfo, targetpath):
2321        """Make a fifo called targetpath.
2322        """
2323        if hasattr(os, "mkfifo"):
2324            os.mkfifo(targetpath)
2325        else:
2326            raise ExtractError("fifo not supported by system")
2327
2328    def makedev(self, tarinfo, targetpath):
2329        """Make a character or block device called targetpath.
2330        """
2331        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2332            raise ExtractError("special devices not supported by system")
2333
2334        mode = tarinfo.mode
2335        if tarinfo.isblk():
2336            mode |= stat.S_IFBLK
2337        else:
2338            mode |= stat.S_IFCHR
2339
2340        os.mknod(targetpath, mode,
2341                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2342
2343    def makelink(self, tarinfo, targetpath):
2344        """Make a (symbolic) link called targetpath. If it cannot be created
2345          (platform limitation), we try to make a copy of the referenced file
2346          instead of a link.
2347        """
2348        try:
2349            # For systems that support symbolic and hard links.
2350            if tarinfo.issym():
2351                os.symlink(tarinfo.linkname, targetpath)
2352            else:
2353                # See extract().
2354                if os.path.exists(tarinfo._link_target):
2355                    os.link(tarinfo._link_target, targetpath)
2356                else:
2357                    self._extract_member(self._find_link_target(tarinfo),
2358                                         targetpath)
2359        except symlink_exception:
2360            if tarinfo.issym():
2361                linkpath = os.path.join(os.path.dirname(tarinfo.name),
2362                                        tarinfo.linkname)
2363            else:
2364                linkpath = tarinfo.linkname
2365        else:
2366            try:
2367                self._extract_member(self._find_link_target(tarinfo),
2368                                     targetpath)
2369            except KeyError:
2370                raise ExtractError("unable to resolve link inside archive")
2371
    def chown(self, tarinfo, targetpath):
        """Set owner of targetpath according to tarinfo.

           Only attempted when running as root, since changing ownership
           requires privileges; failures raise ExtractError.
        """
        if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            # Prefer the symbolic names stored in the archive; fall back to
            # the numeric ids when the names are unknown on this system.
            try:
                g = grp.getgrnam(tarinfo.gname)[2]
            except KeyError:
                g = tarinfo.gid
            try:
                u = pwd.getpwnam(tarinfo.uname)[2]
            except KeyError:
                u = tarinfo.uid
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    # Change ownership of the link itself, not its target.
                    os.lchown(targetpath, u, g)
                else:
                    if sys.platform != "os2emx":
                        os.chown(targetpath, u, g)
            except EnvironmentError as e:
                raise ExtractError("could not change owner")
2393
2394    def chmod(self, tarinfo, targetpath):
2395        """Set file permissions of targetpath according to tarinfo.
2396        """
2397        if hasattr(os, 'chmod'):
2398            try:
2399                os.chmod(targetpath, tarinfo.mode)
2400            except EnvironmentError as e:
2401                raise ExtractError("could not change mode")
2402
2403    def utime(self, tarinfo, targetpath):
2404        """Set modification time of targetpath according to tarinfo.
2405        """
2406        if not hasattr(os, 'utime'):
2407            return
2408        try:
2409            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2410        except EnvironmentError as e:
2411            raise ExtractError("could not change modification time")
2412
2413    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        # The constructor may have read the first member ahead of time;
        # hand it out once instead of re-reading it.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block: normally the end-of-archive marker, but
                # with ignore_zeros we skip it and keep scanning.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # A bad header at offset 0 means this is no tar file.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # Nothing left to read: the member list is now complete.
            self._loaded = True

        return tarinfo
2459
2460    #--------------------------------------------------------------------------
2461    # Little helper methods:
2462
2463    def _getmember(self, name, tarinfo=None, normalize=False):
2464        """Find an archive member by name from bottom to top.
2465           If tarinfo is given, it is used as the starting point.
2466        """
2467        # Ensure that all members have been loaded.
2468        members = self.getmembers()
2469
2470        # Limit the member search list up to tarinfo.
2471        if tarinfo is not None:
2472            members = members[:members.index(tarinfo)]
2473
2474        if normalize:
2475            name = os.path.normpath(name)
2476
2477        for member in reversed(members):
2478            if normalize:
2479                member_name = os.path.normpath(member.name)
2480            else:
2481                member_name = member.name
2482
2483            if name == member_name:
2484                return member
2485
2486    def _load(self):
2487        """Read through the entire archive file and look for readable
2488           members.
2489        """
2490        while True:
2491            tarinfo = self.next()
2492            if tarinfo is None:
2493                break
2494        self._loaded = True
2495
2496    def _check(self, mode=None):
2497        """Check if TarFile is still open, and if the operation's mode
2498           corresponds to TarFile's mode.
2499        """
2500        if self.closed:
2501            raise IOError("%s is closed" % self.__class__.__name__)
2502        if mode is not None and self.mode not in mode:
2503            raise IOError("bad operation for mode %r" % self.mode)
2504
2505    def _find_link_target(self, tarinfo):
2506        """Find the target member of a symlink or hardlink member in the
2507           archive.
2508        """
2509        if tarinfo.issym():
2510            # Always search the entire archive.
2511            linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname
2512            limit = None
2513        else:
2514            # Search the archive before the link, because a hard link is
2515            # just a reference to an already archived file.
2516            linkname = tarinfo.linkname
2517            limit = tarinfo
2518
2519        member = self._getmember(linkname, tarinfo=limit, normalize=True)
2520        if member is None:
2521            raise KeyError("linkname %r not found" % linkname)
2522        return member
2523
2524    def __iter__(self):
2525        """Provide an iterator object.
2526        """
2527        if self._loaded:
2528            return iter(self.members)
2529        else:
2530            return TarIter(self)
2531
2532    def _dbg(self, level, msg):
2533        """Write debugging output to sys.stderr.
2534        """
2535        if level <= self.debug:
2536            print(msg, file=sys.stderr)
2537
2538    def __enter__(self):
2539        self._check()
2540        return self
2541
2542    def __exit__(self, type, value, traceback):
2543        if type is None:
2544            self.close()
2545        else:
2546            # An exception occurred. We must not call close() because
2547            # it would try to write end-of-archive blocks and padding.
2548            if not self._extfileobj:
2549                self.fileobj.close()
2550            self.closed = True
2551# class TarFile
2552
class TarIter(object):
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object bound to the given TarFile.
        """
        self.tarfile = tarfile
        # Position in tarfile.members of the next member to hand out.
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
           When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        #
        # Additionally, members may have been read ahead of this
        # iterator (e.g. by getmember()) without the archive being
        # fully loaded; those must be replayed from tarfile.members
        # in order instead of being skipped by calling next() again.
        if self.index == 0 and self.tarfile.firstmember is not None:
            # A member was stashed by a read-ahead; next() returns it.
            tarinfo = self.tarfile.next()
        elif self.index < len(self.tarfile.members):
            # Replay members that were already read from the archive.
            tarinfo = self.tarfile.members[self.index]
        elif not self.tarfile._loaded:
            # Read the next header from the archive.
            tarinfo = self.tarfile.next()
            if not tarinfo:
                self.tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo

    next = __next__ # for Python 2.x
2591
2592#--------------------
2593# exported functions
2594#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    try:
        # Any TarError during open/close means the file is not a tar
        # archive we can read; other exceptions propagate to the caller.
        open(name).close()
    except TarError:
        return False
    return True
2605
# Preserve a reference to the builtin open() before the module-level
# name "open" is rebound below; code elsewhere in this module uses
# bltn_open to reach the real filesystem open().
bltn_open = open
# Rebind "open" so that tarfile.open(...) is the TarFile.open factory.
open = TarFile.open
2608