1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
4#
5# This code is distributed under the terms and conditions
6# from the MIT License (MIT).
7#
8
9"""Implements the majority of smart_open's top-level API.
10
11The main functions are:
12
13  * ``parse_uri()``
14  * ``open()``
15
16"""
17
18import collections
19import io
20import locale
21import logging
22import os
23import os.path as P
24import pathlib
25import urllib.parse
26import warnings
27
28#
# This module defines a function called smart_open, so we cannot use
# smart_open.submodule to refer to its submodules.
31#
32import smart_open.local_file as so_file
33import smart_open.compression as so_compression
34
35from smart_open import doctools
36from smart_open import transport
37
38#
39# For backwards compatibility and keeping old unit tests happy.
40#
41from smart_open.compression import register_compressor  # noqa: F401
42from smart_open.utils import check_kwargs as _check_kwargs  # noqa: F401
43from smart_open.utils import inspect_kwargs as _inspect_kwargs  # noqa: F401
44
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text encoding used by open() and _encoding_wrapper() whenever the caller
# does not supply one.  do_setlocale=False asks for the preferred encoding
# without having the lookup change the process locale.
DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)
48
49
50def _sniff_scheme(uri_as_string):
51    """Returns the scheme of the URL only, as a string."""
52    #
53    # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
54    # no protocol given => assume a local file
55    #
56    if os.name == 'nt' and '://' not in uri_as_string:
57        uri_as_string = 'file://' + uri_as_string
58
59    return urllib.parse.urlsplit(uri_as_string).scheme
60
61
def parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Parameters
    ----------
    uri_as_string: str
        The URI to parse.

    Returns
    -------
    collections.namedtuple
        The parsed URI.

    Notes
    -----
    smart_open/doctools.py magic goes here
    """
    # The scheme selects the transport submodule, which does the real parsing.
    transport_module = transport.get_transport(_sniff_scheme(uri_as_string))
    fields = transport_module.parse_uri(uri_as_string)

    #
    # The transport hands back a dict; older tests expect a namedtuple, so
    # build one on the fly with the field names in sorted order.
    #
    uri_type = collections.namedtuple('Uri', sorted(fields))
    return uri_type(**fields)
90
91
#
# To keep old unit tests happy while I'm refactoring.
#
_parse_uri = parse_uri

#
# Capture the built-in open here, before this module's own open() function
# (defined right after this line) shadows the name.  _shortcut_open uses it
# to open plain local files.
#
_builtin_open = open
98
99
def open(
        uri,
        mode='r',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_ext=False,
        compression=None,
        transport_params=None,
        ):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    ignore_ext: boolean, optional
        Disable transparent compression/decompression based on the file extension.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
        If you specify this parameter, then ignore_ext must not be specified.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.7/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__

    """
    # Must stay the first statement so that locals() holds only the arguments.
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    #
    # Settle on a single compression setting.  An explicit compression and
    # ignore_ext contradict each other; ignore_ext alone switches compression
    # off; with neither given, the file extension decides.
    #
    if compression and ignore_ext:
        raise ValueError('ignore_ext and compression parameters are mutually exclusive')
    if compression and compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')
    if ignore_ext:
        compression = so_compression.NO_COMPRESSION
        warnings.warn("'ignore_ext' will be deprecated in a future release", PendingDeprecationWarning)
    elif compression is None:
        compression = so_compression.INFER_FROM_EXTENSION

    transport_params = {} if transport_params is None else transport_params

    # Fast path: _shortcut_open returns None when it cannot handle the request.
    shortcut = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if shortcut is not None:
        return shortcut

    #
    # Work-around for the problem described in Issue #144: an explicitly
    # specified encoding is taken to mean the caller wants text, so the
    # binary flag is dropped from the mode.
    #
    # If the default mode ever becomes text, matching the normal behavior of
    # Py2 and 3, this assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    # `encoding` remains the caller's explicit choice (possibly None).
    effective_encoding = encoding or DEFAULT_ENCODING

    #
    # The pipeline from filename to result.  Decompression is optional, and
    # always maps bytes to bytes.  Decoding is optional too, and maps bytes
    # to text.  The diagram shows reading; writing flows right to left
    # through the same code.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #
    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        raise NotImplementedError(ve.args[0])

    raw = _open_binary_stream(uri, binary_mode, transport_params)
    decompressed = so_compression.compression_wrapper(raw, binary_mode, compression)

    # Binary mode without an explicit encoding: hand back bytes untouched.
    if 'b' in mode and encoding is None:
        return decompressed

    return _encoding_wrapper(
        decompressed,
        mode,
        encoding=effective_encoding,
        errors=errors,
        newline=newline,
    )
250
251
252def _get_binary_mode(mode_str):
253    #
254    # https://docs.python.org/3/library/functions.html#open
255    #
256    # The order of characters in the mode parameter appears to be unspecified.
257    # The implementation follows the examples, just to be safe.
258    #
259    mode = list(mode_str)
260    binmode = []
261
262    if 't' in mode and 'b' in mode:
263        raise ValueError("can't have text and binary mode at once")
264
265    counts = [mode.count(x) for x in 'rwa']
266    if sum(counts) > 1:
267        raise ValueError("must have exactly one of create/read/write/append mode")
268
269    def transfer(char):
270        binmode.append(mode.pop(mode.index(char)))
271
272    if 'a' in mode:
273        transfer('a')
274    elif 'w' in mode:
275        transfer('w')
276    elif 'r' in mode:
277        transfer('r')
278    else:
279        raise ValueError(
280            "Must have exactly one of create/read/write/append "
281            "mode and at most one plus"
282        )
283
284    if 'b' in mode:
285        transfer('b')
286    elif 't' in mode:
287        mode.pop(mode.index('t'))
288        binmode.append('b')
289    else:
290        binmode.append('b')
291
292    if '+' in mode:
293        transfer('+')
294
295    #
296    # There shouldn't be anything left in the mode list at this stage.
297    # If there is, then either we've missed something and the implementation
298    # of this function is broken, or the original input mode is invalid.
299    #
300    if mode:
301        raise ValueError('invalid mode: %r' % mode_str)
302
303    return ''.join(binmode)
304
305
def _shortcut_open(
        uri,
        mode,
        compression,
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        ):
    """Open the URI with the built-in open function, when that is possible.

    Going straight to the built-in open can be much faster than opening in
    binary mode and decoding afterwards.  The shortcut is taken only when:

        1. The URI is a string that refers to a local file; and
        2. No compression applies: either compression is disabled, or it is
           inferred from the extension and the extension is not a supported
           compression extension.

    In every other case this function returns None.

    :param str uri: A string indicating what to open.
    :param str mode: The mode to pass to the open function.
    :param str compression: The compression type selected.
    :returns: The opened file, or None if the shortcut does not apply.
    :rtype: file
    """
    if not isinstance(uri, str):
        return None

    if _sniff_scheme(uri) not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    local_path = so_file.extract_local_path(uri)

    infer_from_extension = compression == so_compression.INFER_FROM_EXTENSION
    if not infer_from_extension and compression != so_compression.NO_COMPRESSION:
        # An explicit compression type was requested.
        return None
    if infer_from_extension:
        extension = P.splitext(local_path)[1]
        if extension in so_compression.get_supported_extensions():
            return None

    # An explicit encoding implies text mode, so drop the binary flag.
    if encoding is not None:
        mode = mode.replace('b', '')

    optional = {'encoding': encoding, 'newline': newline}
    open_kwargs = {key: value for key, value in optional.items() if value is not None}

    #
    # binary mode of the builtin/stdlib open function doesn't take an errors argument
    #
    if errors and 'b' not in mode:
        open_kwargs['errors'] = errors

    return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs)
362
363
364def _open_binary_stream(uri, mode, transport_params):
365    """Open an arbitrary URI in the specified binary mode.
366
367    Not all modes are supported for all protocols.
368
369    :arg uri: The URI to open.  May be a string, or something else.
370    :arg str mode: The mode to open with.  Must be rb, wb or ab.
371    :arg transport_params: Keyword argumens for the transport layer.
372    :returns: A named file object
373    :rtype: file-like object with a .name attribute
374    """
375    if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'):
376        #
377        # This should really be a ValueError, but for the sake of compatibility
378        # with older versions, which raise NotImplementedError, we do the same.
379        #
380        raise NotImplementedError('unsupported mode: %r' % mode)
381
382    if hasattr(uri, 'read'):
383        # simply pass-through if already a file-like
384        # we need to return something as the file name, but we don't know what
385        # so we probe for uri.name (e.g., this works with open() or tempfile.NamedTemporaryFile)
386        # if the value ends with COMPRESSED_EXT, we will note it in compression_wrapper()
387        # if there is no such an attribute, we return "unknown" - this
388        # effectively disables any compression
389        if not hasattr(uri, 'name'):
390            uri.name = getattr(uri, 'name', 'unknown')
391        return uri
392
393    if not isinstance(uri, str):
394        raise TypeError("don't know how to handle uri %s" % repr(uri))
395
396    scheme = _sniff_scheme(uri)
397    submodule = transport.get_transport(scheme)
398    fobj = submodule.open_uri(uri, mode, transport_params)
399    if not hasattr(fobj, 'name'):
400        fobj.name = uri
401
402    return fobj
403
404
def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None):
    """Wrap a binary stream so that it reads/writes text, if necessary.

    The stream is returned untouched only when the mode is binary *and* no
    encoding was given.  A non-null encoding implies text mode, even if the
    mode says binary.

    :arg fileobj: must quack like a filehandle object.
    :arg str mode: is the mode which was originally requested by the user.
    :arg str encoding: The text encoding to use.  If mode is binary, overrides mode.
    :arg str errors: The method to use when handling encoding/decoding errors.
    :arg str newline: Passed through to io.TextIOWrapper.
    :returns: a file object
    """
    # Must stay the first statement so that locals() holds only the arguments.
    logger.debug('encoding_wrapper: %r', locals())

    #
    # A binary mode combined with an explicit encoding is treated as a request
    # for text.  Ignoring the encoding and returning bytes instead would make
    # smart_open diverge from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #       without our assumption, because the default mode is rb.
    #
    wants_bytes = 'b' in mode and encoding is None
    if wants_bytes:
        return fileobj

    return io.TextIOWrapper(
        fileobj,
        encoding=DEFAULT_ENCODING if encoding is None else encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )
442
443
class patch_pathlib:
    """Replace `Path.open` with `smart_open.open`

    The replacement is installed as soon as the object is constructed, not
    when the ``with`` block is entered.  Leaving the ``with`` block puts the
    previous `Path.open` implementation back.
    """

    def __init__(self):
        # Install the patch right away, keeping the implementation it replaced.
        self.old_impl = _patch_pathlib(open)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore whatever Path.open was before construction.
        _patch_pathlib(self.old_impl)
455
456
457def _patch_pathlib(func):
458    """Replace `Path.open` with `func`"""
459    old_impl = pathlib.Path.open
460    pathlib.Path.open = func
461    return old_impl
462
463
def smart_open(
        uri,
        mode='rb',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_extension=False,
        **kwargs
    ):
    """Deprecated entry point kept for backward compatibility.

    Emits a DeprecationWarning and forwards every argument to this module's
    ``open`` function.  It differs from ``open`` in three ways:

    1. The default mode is read binary (mode='rb')
    2. The ignore_ext parameter is called ignore_extension
    3. Transport parameters used to be passed directly as kwargs; that is no
       longer supported, and any extra keyword argument is rejected.

    :returns: Whatever ``open`` returns for the forwarded arguments.
    :raises DeprecationWarning: raised as an exception (not merely warned)
        when any unsupported keyword argument is passed.
    """
    url = 'https://github.com/RaRe-Technologies/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst'
    if kwargs:
        raise DeprecationWarning(
            'The following keyword parameters are not supported: %r. '
            'See  %s for more information.' % (sorted(kwargs), url)
        )
    message = 'This function is deprecated.  See %s for more information' % url
    #
    # stacklevel=2 attributes the warning to the code that called smart_open
    # rather than to this module, so the report points at the line that needs
    # migrating.
    #
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)

    # Forward explicitly instead of via locals(), which silently picks up any
    # local variable that happens to exist at the call site.
    return open(
        uri=uri,
        mode=mode,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
        closefd=closefd,
        opener=opener,
        ignore_ext=ignore_extension,
    )
499
500
501#
502# Prevent failures with doctools from messing up the entire library.  We don't
503# expect such failures, but contributed modules (e.g. new transport mechanisms)
504# may not be as polished.
505#
try:
    # Runs once, at import time.  Presumably these calls fill in the
    # "smart_open/doctools.py magic goes here" placeholder lines in the two
    # docstrings -- confirm against smart_open/doctools.py.
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception as ex:
    # Deliberately broad: a docstring problem must not stop the import.
    logger.error(
        'Encountered a non-fatal error while building docstrings (see below). '
        'help(smart_open) will provide incomplete information as a result. '
        'For full help text, see '
        '<https://github.com/RaRe-Technologies/smart_open/blob/master/help.txt>.'
    )
    logger.exception(ex)
517