"""A file interface for handling local and remote data files.

The goal of datasource is to abstract some of the file system operations
when dealing with data files so the researcher doesn't have to know all the
low-level details.  Through datasource, a researcher can obtain and use a
file with one function call, regardless of location of the file.

DataSource is meant to augment standard python libraries, not replace them.
It should work seamlessly with standard file IO operations and the os
module.

DataSource files can originate locally or remotely:

- local files : '/home/guido/src/local/data.txt'
- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'

DataSource files can also be compressed or uncompressed.  Currently only
gzip, bz2 and xz are supported.

Example::

    >>> # Create a DataSource, use os.curdir (default) for local storage.
    >>> from numpy import DataSource
    >>> ds = DataSource()
    >>>
    >>> # Open a remote file.
    >>> # DataSource downloads the file, stores it locally in:
    >>> #     './www.google.com/index.html'
    >>> # opens the file and returns a file object.
    >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
    >>>
    >>> # Use the file as you normally would
    >>> fp.read() # doctest: +SKIP
    >>> fp.close() # doctest: +SKIP

"""
import os
import shutil
import io

from numpy.core.overrides import set_module


# Keep a handle on the builtin ``open``; the name is shadowed by the
# module-level ``open`` convenience function defined below.
_open = open


def _check_mode(mode, encoding, newline):
    """Check mode and that encoding and newline are compatible.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` requests both text and binary, or if `encoding` or
        `newline` is given for a mode that does not contain ``'t'``.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")


# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
# deferring the import of lzma, bz2 and gzip until needed

# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary. Currently uncompressed files as well as files
    compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # The compression modules are imported lazily by `_load`; until
        # then only the opener for uncompressed files (key None) is known.
        self._loaded = False
        self._file_openers = {None: io.open}

    def _load(self):
        """Import the compression modules once and register their openers.

        Modules that cannot be imported are skipped, so the set of
        supported extensions depends on the running interpreter.
        """
        if self._loaded:
            return

        try:
            import bz2
            self._file_openers[".bz2"] = bz2.open
        except ImportError:
            pass

        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            pass

        try:
            import lzma
            self._file_openers[".xz"] = lzma.open
            self._file_openers[".lzma"] = lzma.open
        except (ImportError, AttributeError):
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so catch that as well as ImportError.
            pass

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        """Return the opener registered for extension `key`.

        Raises ``KeyError`` if `key` is not a supported extension.
        """
        self._load()
        return self._file_openers[key]


_file_openers = _FileOpeners()


def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is an URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path.  Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function that instantiates a `DataSource` and
    returns the file object from ``DataSource.open(path)``.

    """

    ds = DataSource(destpath)
    return ds.open(path, mode, encoding=encoding, newline=newline)


@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath.

        Any falsy `destpath` (``None`` but also ``''``) selects a freshly
        created temporary directory, which is removed again in `__del__`.
        """
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  The hasattr guard covers instances
        # whose __init__ raised before `_istmpdest` was assigned.
        if hasattr(self, '_istmpdest') and self._istmpdest:
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        The extension is compared against the keys of `_file_openers`.
        """
        _base, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``'w'`` and ``'+'`` are treated as write
        indicators; ``'a'`` and ``'x'`` are not.
        """
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : tuple
            ``zip_ext`` includes the leading dot (e.g. ``'.gz'``) and is
            None when `filename` has no supported compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself.  If `filename` does
        not already carry a supported compression extension, one candidate
        per supported extension is appended.
        """
        names = [filename]
        if not self._iszip(filename):
            # The None key stands for "uncompressed" and is skipped.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists.  exist_ok avoids the race between
        # checking for the directory and creating it.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        is found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)

        Notes
        -----
        ``str.lstrip`` removes *characters*, not a prefix, so every leading
        ``'.'`` is dropped, including the dot of hidden files such as
        ``'.bashrc'``.  This is left as is because changing it would move
        existing cache locations.
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a fixed point is reached: removing one kind of
        # prefix can expose another.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Anything that is not an URL has no further place to be looked up.
        if not self._isurl(path):
            return False

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test remote url
        try:
            netfile = urlopen(path)
            netfile.close()
            return True
        except URLError:
            return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        FileNotFoundError
            If neither `path` nor a compressed variant of it can be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            # FileNotFoundError is an OSError (IOError) subclass, so callers
            # catching IOError keep working.
            raise FileNotFoundError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        if ext == '.bz2':
            # bz2.open rejects update modes; drop the '+'.  The extension
            # carries its leading dot and str.replace returns a new string,
            # so the result has to be rebound.
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)


class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        splitpath = path.split(self._baseurl, 2)
        if len(splitpath) == 1:
            result = os.path.join(self._baseurl, path)
        else:
            result = path    # path contains baseurl already
        return result

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource._findfile(self, self._fullpath(path))

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        return DataSource.abspath(self, self._fullpath(path))

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        return DataSource.exists(self, self._fullpath(path))

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        return DataSource.open(self, self._fullpath(path), mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                  "Directory listing of URLs, not supported yet.")
        else:
            return os.listdir(self._baseurl)