1"""Utilities for fast persistence of big data, with optional compression."""
2
3# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
4# Copyright (c) 2009 Gael Varoquaux
5# License: BSD Style, 3 clauses.
6
7import pickle
8import io
9import sys
10import warnings
11import contextlib
12
13from .compressor import _ZFILE_PREFIX
14from .compressor import _COMPRESSORS
15
16try:
17    import numpy as np
18except ImportError:
19    np = None
20
# Expose the pure-Python pickler/unpickler classes.
# NOTE(review): presumably these are subclassed elsewhere to customize
# (de)serialization hooks that the C-accelerated implementations do not
# support — confirm against the callers.
Unpickler = pickle._Unpickler
Pickler = pickle._Pickler
# Python 2 compatibility leftover: alias xrange to the Python 3 range.
xrange = range
24
25
26try:
27    # The python standard library can be built without bz2 so we make bz2
28    # usage optional.
29    # see https://github.com/scikit-learn/scikit-learn/issues/7526 for more
30    # details.
31    import bz2
32except ImportError:
33    bz2 = None
34
# Buffer size used in io.BufferedReader and io.BufferedWriter (1 MiB).
_IO_BUFFER_SIZE = 1024 ** 2
37
38
39def _is_raw_file(fileobj):
40    """Check if fileobj is a raw file object, e.g created with open."""
41    fileobj = getattr(fileobj, 'raw', fileobj)
42    return isinstance(fileobj, io.FileIO)
43
44
def _get_prefixes_max_len():
    """Return the length of the longest registered magic-number prefix."""
    # Consider every registered compressor prefix plus the legacy zfile one.
    longest_compressor = max(
        (len(compressor.prefix) for compressor in _COMPRESSORS.values()),
        default=0)
    return max(longest_compressor, len(_ZFILE_PREFIX))
50
51
52def _is_numpy_array_byte_order_mismatch(array):
53    """Check if numpy array is having byte order mis-match"""
54    return ((sys.byteorder == 'big' and
55             (array.dtype.byteorder == '<' or
56              (array.dtype.byteorder == '|' and array.dtype.fields and
57               all(e[0].byteorder == '<'
58                   for e in array.dtype.fields.values())))) or
59            (sys.byteorder == 'little' and
60             (array.dtype.byteorder == '>' or
61              (array.dtype.byteorder == '|' and array.dtype.fields and
62               all(e[0].byteorder == '>'
63                   for e in array.dtype.fields.values())))))
64
65
def _ensure_native_byte_order(array):
    """Use the byte order of the host while preserving values

    Does nothing if array already uses the system byte order.
    """
    if _is_numpy_array_byte_order_mismatch(array):
        # byteswap() reverses the bytes of each item; re-interpreting the
        # buffer with a native-order dtype therefore preserves the values.
        # ndarray.newbyteorder was removed in numpy 2.0, so go through a
        # dtype view instead of the former array.newbyteorder('=') call.
        array = array.byteswap().view(array.dtype.newbyteorder('='))
    return array
74
75
76###############################################################################
77# Cache file utilities
def _detect_compressor(fileobj):
    """Return the compressor matching fileobj.

    Parameters
    ----------
    fileobj: file object

    Returns
    -------
    str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', 'not-compressed'}
    """
    # Grab the magic number from the head of the file. peek() leaves the
    # cursor untouched; otherwise read the bytes and rewind.
    max_prefix_len = _get_prefixes_max_len()
    if hasattr(fileobj, 'peek'):
        first_bytes = fileobj.peek(max_prefix_len)
    else:
        first_bytes = fileobj.read(max_prefix_len)
        fileobj.seek(0)

    # Legacy joblib (< 0.10) zfile format takes precedence.
    if first_bytes.startswith(_ZFILE_PREFIX):
        return "compat"

    for name, compressor in _COMPRESSORS.items():
        if first_bytes.startswith(compressor.prefix):
            return name

    return "not-compressed"
108
109
def _buffered_read_file(fobj):
    """Wrap a readable raw file object in a buffered reader."""
    return io.BufferedReader(fobj, _IO_BUFFER_SIZE)
113
114
def _buffered_write_file(fobj):
    """Wrap a writable raw file object in a buffered writer."""
    return io.BufferedWriter(fobj, _IO_BUFFER_SIZE)
118
119
@contextlib.contextmanager
def _read_fileobject(fileobj, filename, mmap_mode=None):
    """Utility function opening the right fileobject from a filename.

    The magic number is used to choose between the type of file object to open:
    * regular file object (default)
    * zlib file object
    * gzip file object
    * bz2 file object
    * lzma file object (for xz and lzma compressor)

    Parameters
    ----------
    fileobj: file object
    filename: str
        filename path corresponding to the fileobj parameter.
    mmap_mode: str
        memory map mode that should be used to open the pickle file. This
        parameter is useful to verify that the user is not trying to use
        mmap_mode on a compressed file. Default: None.

    Yields
    ------
        a file like object, or the filename itself for 'compat' (pre-0.10
        joblib) pickles.
    """
    # Detect if the fileobj contains compressed data.
    compressor = _detect_compressor(fileobj)

    if compressor == 'compat':
        # Compatibility with old pickle mode: simply return the input
        # filename "as-is" and let the compatibility function be called by
        # the caller.
        warnings.warn("The file '%s' has been generated with a joblib "
                      "version less than 0.10. "
                      "Please regenerate this pickle file." % filename,
                      DeprecationWarning, stacklevel=2)
        yield filename
    else:
        if compressor in _COMPRESSORS:
            # Based on the compressor detected in the file, open the matching
            # decompressor file object, wrapped in a buffer.
            compressor_wrapper = _COMPRESSORS[compressor]
            inst = compressor_wrapper.decompressor_file(fileobj)
            fileobj = _buffered_read_file(inst)

        # Checking if incompatible load parameters with the type of file:
        # mmap_mode cannot be used with compressed file or in memory buffers
        # such as io.BytesIO. The flag is ignored (with a warning) rather
        # than raising so loading still succeeds.
        if mmap_mode is not None:
            # Explicit mappings instead of the former `% locals()` so the
            # messages do not silently depend on local variable names.
            if isinstance(fileobj, io.BytesIO):
                warnings.warn('In memory persistence is not compatible with '
                              'mmap_mode "%(mmap_mode)s" flag passed. '
                              'mmap_mode option will be ignored.'
                              % {'mmap_mode': mmap_mode}, stacklevel=2)
            elif compressor != 'not-compressed':
                warnings.warn('mmap_mode "%(mmap_mode)s" is not compatible '
                              'with compressed file %(filename)s. '
                              '"%(mmap_mode)s" flag will be ignored.'
                              % {'mmap_mode': mmap_mode, 'filename': filename},
                              stacklevel=2)
            elif not _is_raw_file(fileobj):
                warnings.warn('"%(fileobj)r" is not a raw file, mmap_mode '
                              '"%(mmap_mode)s" flag will be ignored.'
                              % {'fileobj': fileobj, 'mmap_mode': mmap_mode},
                              stacklevel=2)

        yield fileobj
188
189
def _write_fileobject(filename, compress=("zlib", 3)):
    """Return the right compressor file object in write mode.

    Parameters
    ----------
    filename: str
        path of the file to open for writing.
    compress: tuple
        (compressor name, compression level); an unknown compressor name
        silently falls back to 'zlib'.
    """
    compressmethod = compress[0]
    compresslevel = compress[1]

    # The two former branches were identical except for the compressor
    # lookup: deduplicate with a dict fallback to zlib.
    compressor = _COMPRESSORS.get(compressmethod, _COMPRESSORS['zlib'])
    file_instance = compressor.compressor_file(
        filename, compresslevel=compresslevel)
    return _buffered_write_file(file_instance)
203
204
# Utility functions/variables from numpy required for writing arrays.
# We need at least the functions introduced in version 1.9 of numpy. Here,
# we use the ones from numpy 1.10.2.
BUFFER_SIZE = 2 ** 18  # size of buffer for reading npz files in bytes (256 KiB)
209
210
211def _read_bytes(fp, size, error_template="ran out of data"):
212    """Read from file-like object until size bytes are read.
213
214    TODO python2_drop: is it still needed? The docstring mentions python 2.6
215    and it looks like this can be at least simplified ...
216
217    Raises ValueError if not EOF is encountered before size bytes are read.
218    Non-blocking objects only supported if they derive from io objects.
219
220    Required as e.g. ZipExtFile in python 2.6 can return less data than
221    requested.
222
223    This function was taken from numpy/lib/format.py in version 1.10.2.
224
225    Parameters
226    ----------
227    fp: file-like object
228    size: int
229    error_template: str
230
231    Returns
232    -------
233    a bytes object
234        The data read in bytes.
235
236    """
237    data = bytes()
238    while True:
239        # io files (default in python3) return None or raise on
240        # would-block, python2 file will truncate, probably nothing can be
241        # done about that.  note that regular files can't be non-blocking
242        try:
243            r = fp.read(size - len(data))
244            data += r
245            if len(r) == 0 or len(data) == size:
246                break
247        except io.BlockingIOError:
248            pass
249    if len(data) != size:
250        msg = "EOF: reading %s, expected %d bytes got %d"
251        raise ValueError(msg % (error_template, size, len(data)))
252    else:
253        return data
254