"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

from builtins import open as _builtin_open
import io
import os
import warnings
try:
    import _compression
except ImportError:
    # NOTE(review): newer CPython releases appear to ship this private
    # helper module as compression._common._streams -- confirm against
    # the interpreter versions this file must support.
    from compression._common import _streams as _compression
from threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3

# Distinguishes "buffering not passed" from any explicit value.
_sentinel = object()


class BZ2File(_compression.BaseStream):
    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=_sentinel,
                 compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored since Python 3.0. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read() or write() attribute.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if buffering is not _sentinel:
            warnings.warn("Use of 'buffering' argument is deprecated and ignored "
                          "since Python 3.0.",
                          DeprecationWarning,
                          stacklevel=2)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalise the mode to its explicit binary spelling; the
        # compressor is created before any file is opened.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # We opened the underlying file, so close() must close it too.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it is left open by close().
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset state even if flushing or closing raised.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # Accept any object supporting the buffer protocol.
                # len() would count items rather than bytes for buffers
                # with multi-byte items, so measure via nbytes instead.
                data = memoryview(data)
                length = data.nbytes

            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes; text handling is layered on top.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()


def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    # Each pass decodes one complete stream; unused_data holds whatever
    # followed it, which may be a further concatenated stream.
    while data:
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        data = decomp.unused_data
    return b"".join(results)