# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
#
# This code is distributed under the terms and conditions
# from the MIT License (MIT).
#

"""Implements the majority of smart_open's top-level API.

The main functions are:

  * ``parse_uri()``
  * ``open()``

"""

import collections
import io
import locale
import logging
import os
import os.path as P
import pathlib
import urllib.parse
import warnings

#
# This module defines a function called smart_open so we cannot use
# smart_open.submodule to refer to the submodules.
#
import smart_open.local_file as so_file
import smart_open.compression as so_compression

from smart_open import doctools
from smart_open import transport

#
# For backwards compatibility and keeping old unit tests happy.
#
from smart_open.compression import register_compressor  # noqa: F401
from smart_open.utils import check_kwargs as _check_kwargs  # noqa: F401
from smart_open.utils import inspect_kwargs as _inspect_kwargs  # noqa: F401

logger = logging.getLogger(__name__)

DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)


def _sniff_scheme(uri_as_string):
    """Returns the scheme of the URL only, as a string.

    :param str uri_as_string: The URI to inspect.
    :returns: The scheme component, as reported by ``urllib.parse.urlsplit``.
    :rtype: str
    """
    #
    # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
    # no protocol given => assume a local file
    #
    if os.name == 'nt' and '://' not in uri_as_string:
        uri_as_string = 'file://' + uri_as_string

    return urllib.parse.urlsplit(uri_as_string).scheme


def parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Parameters
    ----------
    uri_as_string: str
        The URI to parse.

    Returns
    -------
    collections.namedtuple
        The parsed URI.

    Notes
    -----
    smart_open/doctools.py magic goes here
    """
    scheme = _sniff_scheme(uri_as_string)
    submodule = transport.get_transport(scheme)
    as_dict = submodule.parse_uri(uri_as_string)

    #
    # The conversion to a namedtuple is just to keep the old tests happy while
    # I'm still refactoring.  Field names are sorted so that the field order
    # does not depend on the order in which the transport built its dict.
    #
    Uri = collections.namedtuple('Uri', sorted(as_dict))
    return Uri(**as_dict)


#
# To keep old unit tests happy while I'm refactoring.
#
_parse_uri = parse_uri

#
# Keep a handle on the built-in open, because this module shadows it below.
#
_builtin_open = open


def open(
        uri,
        mode='r',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_ext=False,
        compression=None,
        transport_params=None,
        ):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    ignore_ext: boolean, optional
        Disable transparent compression/decompression based on the file extension.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
        If you specify this parameter, then ignore_ext must not be specified.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Raises
    ------
    TypeError
        If mode is not a string.
    ValueError
        If both ignore_ext and compression are specified, or if the
        compression type is not supported.
    NotImplementedError
        If the mode cannot be mapped to a supported binary mode.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.7/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__

    """
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    if compression and ignore_ext:
        raise ValueError('ignore_ext and compression parameters are mutually exclusive')
    elif compression and compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')
    elif ignore_ext:
        compression = so_compression.NO_COMPRESSION
        #
        # stacklevel=2 attributes the warning to the caller's line, not to
        # this library, so that it is reported where it can be acted upon.
        #
        warnings.warn(
            "'ignore_ext' will be deprecated in a future release",
            PendingDeprecationWarning,
            stacklevel=2,
        )
    elif compression is None:
        compression = so_compression.INFER_FROM_EXTENSION

    if transport_params is None:
        transport_params = {}

    #
    # NOTE: _shortcut_open only accepts str URIs, and pathlib.Path objects are
    # converted to str further below, so Path inputs always take the generic
    # (binary stream + wrappers) route.
    #
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        #
        # Older versions raised NotImplementedError here, so keep doing that,
        # but chain the original error so the real cause stays in the traceback.
        #
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    decompressed = so_compression.compression_wrapper(binary, binary_mode, compression)

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    return decoded


def _get_binary_mode(mode_str):
    """Translate a user-supplied mode into the equivalent binary mode.

    The result consists of exactly one of ``a``, ``w`` or ``r``, followed by
    ``b``, optionally followed by ``+``.  A ``t`` in the input is dropped.

    :param str mode_str: The mode requested by the user, e.g. ``r``, ``wt``, ``ab+``.
    :returns: The binary mode, e.g. ``rb``, ``wb``, ``ab+``.
    :rtype: str
    :raises ValueError: If the mode mixes text and binary, contains more than
        one of r/w/a, contains none of them, or has leftover characters.
    """
    #
    # https://docs.python.org/3/library/functions.html#open
    #
    # The order of characters in the mode parameter appears to be unspecified.
    # The implementation follows the examples, just to be safe.
    #
    mode = list(mode_str)
    binmode = []

    if 't' in mode and 'b' in mode:
        raise ValueError("can't have text and binary mode at once")

    if sum(mode.count(x) for x in 'rwa') > 1:
        raise ValueError("must have exactly one of create/read/write/append mode")

    def transfer(char):
        # Move the first occurrence of char from the input to the output.
        binmode.append(mode.pop(mode.index(char)))

    if 'a' in mode:
        transfer('a')
    elif 'w' in mode:
        transfer('w')
    elif 'r' in mode:
        transfer('r')
    else:
        raise ValueError(
            "Must have exactly one of create/read/write/append "
            "mode and at most one plus"
        )

    if 'b' in mode:
        transfer('b')
    elif 't' in mode:
        # Text mode was requested explicitly: consume the flag, go binary.
        mode.pop(mode.index('t'))
        binmode.append('b')
    else:
        binmode.append('b')

    if '+' in mode:
        transfer('+')

    #
    # There shouldn't be anything left in the mode list at this stage.
    # If there is, then either we've missed something and the implementation
    # of this function is broken, or the original input mode is invalid.
    #
    if mode:
        raise ValueError('invalid mode: %r' % mode_str)

    return ''.join(binmode)


def _shortcut_open(
        uri,
        mode,
        compression,
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        ):
    """Try to open the URI using the standard library io.open function.

    This can be much faster than the alternative of opening in binary mode and
    then decoding.

    This is only possible under the following conditions:

        1. The URI is a string that refers to a local file; and
        2. Compression is disabled, or it is inferred from the extension and
           the extension is not one of the supported compression extensions.

    If it is not possible to use the built-in open for the specified URI, returns None.

    :param str uri: A string indicating what to open.
    :param str mode: The mode to pass to the open function.
    :param str compression: The compression type selected.
    :param int buffering: Passed through to the built-in open.
    :param str encoding: If not None, passed through to the built-in open,
        and any ``b`` is removed from the mode.
    :param str errors: Passed through to the built-in open in text mode only.
    :param str newline: If not None, passed through to the built-in open.
    :returns: The opened file, or None if the shortcut does not apply.
    :rtype: file
    """
    if not isinstance(uri, str):
        return None

    scheme = _sniff_scheme(uri)
    if scheme not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    local_path = so_file.extract_local_path(uri)
    if compression == so_compression.INFER_FROM_EXTENSION:
        _, extension = P.splitext(local_path)
        if extension in so_compression.get_supported_extensions():
            return None
    elif compression != so_compression.NO_COMPRESSION:
        return None

    open_kwargs = {}
    if encoding is not None:
        # An explicit encoding implies text mode.
        open_kwargs['encoding'] = encoding
        mode = mode.replace('b', '')
    if newline is not None:
        open_kwargs['newline'] = newline

    #
    # binary mode of the builtin/stdlib open function doesn't take an errors argument
    #
    if errors and 'b' not in mode:
        open_kwargs['errors'] = errors

    return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs)


def _open_binary_stream(uri, mode, transport_params):
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    :arg uri: The URI to open.  May be a string, or something else.
    :arg str mode: The mode to open with.  Must be one of rb, rb+, wb, wb+, ab or ab+.
    :arg transport_params: Keyword arguments for the transport layer.
    :returns: A named file object
    :rtype: file-like object with a .name attribute
    :raises NotImplementedError: If the mode is not one of the supported binary modes.
    :raises TypeError: If the URI is neither a string nor an object with a read attribute.
    """
    if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'):
        #
        # This should really be a ValueError, but for the sake of compatibility
        # with older versions, which raise NotImplementedError, we do the same.
        #
        raise NotImplementedError('unsupported mode: %r' % mode)

    if hasattr(uri, 'read'):
        # simply pass-through if already a file-like
        # we need to return something as the file name, but we don't know what
        # so we probe for uri.name (e.g., this works with open() or tempfile.NamedTemporaryFile)
        # if the value ends with COMPRESSED_EXT, we will note it in compression_wrapper()
        # if there is no such an attribute, we set it to "unknown" - this
        # effectively disables any compression
        if not hasattr(uri, 'name'):
            uri.name = 'unknown'
        return uri

    if not isinstance(uri, str):
        raise TypeError("don't know how to handle uri %s" % repr(uri))

    scheme = _sniff_scheme(uri)
    submodule = transport.get_transport(scheme)
    fobj = submodule.open_uri(uri, mode, transport_params)
    if not hasattr(fobj, 'name'):
        fobj.name = uri

    return fobj


def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None):
    """Decode bytes into text, if necessary.

    If mode specifies binary access, does nothing, unless the encoding is
    specified.  A non-null encoding implies text mode.

    :arg fileobj: must quack like a filehandle object.
    :arg str mode: is the mode which was originally requested by the user.
    :arg str encoding: The text encoding to use.  If mode is binary, overrides mode.
    :arg str errors: The method to use when handling encoding/decoding errors.
    :arg str newline: Passed through to io.TextIOWrapper.
    :returns: a file object
    """
    logger.debug('encoding_wrapper: %r', locals())

    #
    # If the mode is binary, but the user specified an encoding, assume they
    # want text.  If we don't make this assumption, ignore the encoding and
    # return bytes, smart_open behavior will diverge from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #       without our assumption, because the default mode is rb.
    #
    if 'b' in mode and encoding is None:
        return fileobj

    if encoding is None:
        encoding = DEFAULT_ENCODING

    fileobj = io.TextIOWrapper(
        fileobj,
        encoding=encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )
    return fileobj


class patch_pathlib(object):
    """Replace `Path.open` with `smart_open.open`

    The replacement happens as soon as the object is constructed.  Leaving
    the ``with`` block puts back the implementation that was in place at
    construction time.
    """

    def __init__(self):
        self.old_impl = _patch_pathlib(open)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        _patch_pathlib(self.old_impl)


def _patch_pathlib(func):
    """Replace `Path.open` with `func`

    :returns: The previous value of `pathlib.Path.open`.
    """
    old_impl = pathlib.Path.open
    pathlib.Path.open = func
    return old_impl


def smart_open(
        uri,
        mode='rb',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_extension=False,
        **kwargs
        ):
    #
    # This is a thin wrapper of smart_open.open.  It's here for backward
    # compatibility.  It works exactly like smart_open.open when the passed
    # parameters are identical.  Otherwise, it raises a DeprecationWarning.
    #
    # For completeness, the main differences of the old smart_open function:
    #
    # 1. Default mode was read binary (mode='rb')
    # 2. ignore_ext parameter was called ignore_extension
    # 3. Transport parameters were passed directly as kwargs
    #
    url = 'https://github.com/RaRe-Technologies/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst'
    if kwargs:
        raise DeprecationWarning(
            'The following keyword parameters are not supported: %r. '
            'See %s for more information.' % (sorted(kwargs), url)
        )
    message = 'This function is deprecated.  See %s for more information' % url
    # stacklevel=2 attributes the warning to the caller of smart_open().
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)

    #
    # Forward every argument explicitly: this keeps the call independent of
    # whatever local variables happen to exist in this function.
    #
    return open(
        uri,
        mode=mode,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
        closefd=closefd,
        opener=opener,
        ignore_ext=ignore_extension,
    )


#
# Prevent failures with doctools from messing up the entire library.  We don't
# expect such failures, but contributed modules (e.g. new transport mechanisms)
# may not be as polished.
#
try:
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception as ex:
    logger.error(
        'Encountered a non-fatal error while building docstrings (see below). '
        'help(smart_open) will provide incomplete information as a result. '
        'For full help text, see '
        '<https://github.com/RaRe-Technologies/smart_open/blob/master/help.txt>.'
    )
    logger.exception(ex)