1# -*- coding: utf-8 -*- 2from __future__ import absolute_import, print_function, division 3 4import logging 5import sys 6from contextlib import contextmanager 7 8from petl.compat import PY3 9from petl.io.sources import register_reader, register_writer, get_reader, get_writer 10 11logger = logging.getLogger(__name__) 12 13# region RemoteSource 14 15 16class RemoteSource(object): 17 """Read or write directly from files in remote filesystems. 18 19 This source handles many filesystems that are selected based on the 20 protocol passed in the `url` argument. 21 22 The url should be specified in `to..()` and `from...()` functions. E.g.:: 23 24 >>> import petl as etl 25 >>> 26 >>> def example_s3(): 27 ... url = 's3://mybucket/prefix/to/myfilename.csv' 28 ... data = b'foo,bar\\na,1\\nb,2\\nc,2\\n' 29 ... 30 ... etl.tocsv(data, url) 31 ... tbl = etl.fromcsv(url) 32 ... 33 >>> example_s3() # doctest: +SKIP 34 +-----+-----+ 35 | foo | bar | 36 +=====+=====+ 37 | 'a' | '1' | 38 +-----+-----+ 39 | 'b' | '2' | 40 +-----+-----+ 41 | 'c' | '2' | 42 +-----+-----+ 43 44 This source uses `fsspec`_ to provide the data transfer with the remote 45 filesystem. Check the `Built-in Implementations <fs_builtin>`_ for available 46 remote implementations. 47 48 Some filesystem can use `URL chaining <fs_chain>`_ for compound I/O. 49 50 .. note:: 51 52 For working this source require `fsspec`_ to be installed, e.g.:: 53 54 $ pip install fsspec 55 56 Some remote filesystems require aditional packages to be installed. 57 Check `Known Implementations <fs_known>`_ for checking what packages 58 need to be installed, e.g.:: 59 60 $ pip install s3fs # AWS S3 61 $ pip install gcsfs # Google Cloud Storage 62 $ pip install adlfs # Azure Blob service 63 $ pip install paramiko # SFTP 64 $ pip install requests # HTTP, github 65 66 .. versionadded:: 1.6.0 67 68 .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ 69 .. _fs_builtin: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations 70 .. _fs_known: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations 71 .. _fs_chain: https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining 72 """ 73 74 def __init__(self, url, **kwargs): 75 self.url = url 76 self.kwargs = kwargs 77 78 def open_file(self, mode="rb"): 79 import fsspec 80 # auto_mkdir=True can fail in some filesystems or without permission for full path 81 # E.g: s3fs tries to create a bucket when writing into a folder does not exists 82 fs = fsspec.open(self.url, mode=mode, compression='infer', auto_mkdir=False, **self.kwargs) 83 return fs 84 85 @contextmanager 86 def open(self, mode="rb"): 87 mode2 = mode[:1] + r"b" # python2 88 fs = self.open_file(mode=mode2) 89 with fs as source: 90 yield source 91 92 93# registering filesystems with packages installed 94 95 96def _register_filesystems(only_available=False): 97 """Register all known fsspec implementations as remote source.""" 98 from fsspec.registry import known_implementations, registry 99 # https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations 100 # https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations 101 _register_filesystems_from(known_implementations, only_available) 102 # https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.registry.register_implementation 103 _register_filesystems_from(registry, only_available) 104 105 106def _register_filesystems_from(fsspec_registry, only_available): 107 """Register each fsspec provider from this registry as remote source.""" 108 for protocol, spec in fsspec_registry.items(): 109 missing_deps = "err" in spec 110 if missing_deps and only_available: 111 # this could lead to only buit-in implementations available 112 # Other Known Implementations are reported with 'err' even even 113 # the package is installed 114 continue 115 # When missing a package for fsspec use the available source in petl 116 # E.g: fsspec requires `requests` package installed for handling http and https 117 # but petl has URLSource that can work with urlib 118 has_reader = get_reader(protocol) 119 if not missing_deps or has_reader is None: 120 register_reader(protocol, RemoteSource) 121 has_writer = get_writer(protocol) 122 if not missing_deps or has_writer is None: 123 register_writer(protocol, RemoteSource) 124 125 126def _try_register_filesystems(): 127 try: 128 # pylint: disable=unused-import 129 import fsspec # noqa: F401 130 except ImportError: 131 logger.debug("# Missing fsspec package. Install with: pip install fsspec") 132 else: 133 try: 134 _register_filesystems() 135 except Exception as ex: 136 raise ImportError("# ERROR: failed to register fsspec filesystems", ex) 137 138 139if PY3: 140 _try_register_filesystems() 141 142# endregion 143 144# region SMBSource 145 146 147class SMBSource(object): 148 """Downloads or uploads to Windows and Samba network drives. E.g.:: 149 150 >>> def example_smb(): 151 ... import petl as etl 152 ... url = 'smb://user:password@server/share/folder/file.csv' 153 ... data = b'foo,bar\\na,1\\nb,2\\nc,2\\n' 154 ... etl.tocsv(data, url) 155 ... tbl = etl.fromcsv(url) 156 ... 157 >>> example_smb() # doctest: +SKIP 158 +-----+-----+ 159 | foo | bar | 160 +=====+=====+ 161 | 'a' | '1' | 162 +-----+-----+ 163 | 'b' | '2' | 164 +-----+-----+ 165 | 'c' | '2' | 166 +-----+-----+ 167 168 The argument `url` (str) must have a URI with format: 169 `smb://workgroup;user:password@server:port/share/folder/file.csv`. 170 171 Note that you need to pass in a valid hostname or IP address for the host 172 component of the URL. Do not use the Windows/NetBIOS machine name for the 173 host component. 174 175 The first component of the path in the URL points to the name of the shared 176 folder. Subsequent path components will point to the directory/folder/file. 177 178 .. note:: 179 180 For working this source require `smbprotocol`_ to be installed, e.g.:: 181 182 $ pip install smbprotocol[kerberos] 183 184 .. versionadded:: 1.5.0 185 186 .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements 187 """ 188 189 def __init__(self, url, **kwargs): 190 self.url = url 191 self.kwargs = kwargs 192 193 @contextmanager 194 def open(self, mode="rb"): 195 mode2 = mode[:1] + r"b" # python2 196 source = _open_file_smbprotocol(self.url, mode=mode2, **self.kwargs) 197 try: 198 yield source 199 finally: 200 source.close() 201 202 203def _open_file_smbprotocol(url, mode="rb", **kwargs): 204 205 _domain, host, port, user, passwd, server_path = _parse_smb_url(url) 206 import smbclient 207 208 try: 209 # register the server with explicit credentials 210 if user: 211 smbclient.register_session( 212 host, username=user, password=passwd, port=port 213 ) 214 # Read an existing file as bytes 215 mode2 = mode[:1] + r"b" 216 filehandle = smbclient.open_file(server_path, mode=mode2, **kwargs) 217 return filehandle 218 219 except Exception as ex: 220 raise ConnectionError("SMB error: %s" % ex).with_traceback(sys.exc_info()[2]) 221 222 223def _parse_smb_url(url): 224 e = "SMB url must be smb://workgroup;user:password@server:port/share/folder/file.txt: " 225 226 if not url: 227 raise ValueError("SMB error: no host given") 228 if not url.startswith("smb://"): 229 raise ValueError(e + url) 230 231 if PY3: 232 from urllib.parse import urlparse 233 else: 234 from urlparse import urlparse 235 parsed = urlparse(url) 236 if not parsed.path: 237 raise ValueError(e + url) 238 239 unc_path = parsed.path.replace("/", "\\") 240 server_path = "\\\\{}{}".format(parsed.hostname, unc_path) 241 242 if not parsed.username: 243 domain = None 244 username = None 245 elif ";" in parsed.username: 246 domain, username = parsed.username.split(";") 247 else: 248 domain, username = None, parsed.username 249 port = 445 if not parsed.port else int(parsed.port) 250 return domain, parsed.hostname, port, username, parsed.password, server_path 251 252 253register_reader("smb", SMBSource) 254register_writer("smb", SMBSource) 255 256# endregion 257