1"""A file interface for handling local and remote data files.
2
3The goal of datasource is to abstract some of the file system operations
4when dealing with data files so the researcher doesn't have to know all the
5low-level details.  Through datasource, a researcher can obtain and use a
6file with one function call, regardless of location of the file.
7
8DataSource is meant to augment standard python libraries, not replace them.
9It should work seamlessly with standard file IO operations and the os
10module.
11
12DataSource files can originate locally or remotely:
13
14- local files : '/home/guido/src/local/data.txt'
15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
16
17DataSource files can also be compressed or uncompressed.  Currently only
18gzip, bz2 and xz are supported.
19
20Example::
21
22    >>> # Create a DataSource, use os.curdir (default) for local storage.
23    >>> from numpy import DataSource
24    >>> ds = DataSource()
25    >>>
26    >>> # Open a remote file.
27    >>> # DataSource downloads the file, stores it locally in:
28    >>> #     './www.google.com/index.html'
29    >>> # opens the file and returns a file object.
30    >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
31    >>>
32    >>> # Use the file as you normally would
33    >>> fp.read() # doctest: +SKIP
34    >>> fp.close() # doctest: +SKIP
35
36"""
37import os
38import shutil
39import io
40
41from numpy.core.overrides import set_module
42
43
# Keep a reference to the builtin ``open``: this module defines its own
# ``open`` function, which shadows the builtin name at module level.
_open = open
45
46
def _check_mode(mode, encoding, newline):
    """Validate a file mode against the text-only keyword arguments.

    A mode containing ``"t"`` is treated as text mode and must not contain
    ``"b"`` as well.  Every other mode is treated as binary, in which case
    neither `encoding` nor `newline` may be supplied.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str or None
        File encoding; only accepted together with a text mode.
    newline : str or None
        Newline for text files; only accepted together with a text mode.

    Raises
    ------
    ValueError
        If `mode` contains both ``"t"`` and ``"b"``, or if `encoding` or
        `newline` is given for a mode that does not contain ``"t"``.

    """
    if "t" not in mode:
        # Binary mode: the text-layer arguments make no sense here.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")
        return

    # Text mode: a simultaneous request for binary is contradictory.
    if "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
68
69
70# Using a class instead of a module-level dictionary
71# to reduce the initial 'import numpy' overhead by
72# deferring the import of lzma, bz2 and gzip until needed
73
74# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Lazily populated registry of functions used to open data files.

    The registry maps a file extension to the callable that opens files of
    that kind.  The key ``None`` stands for uncompressed files and is
    always present; the entries for ``gzip``, ``bz2`` and ``xz``/``lzma``
    compressed files are only added (and the corresponding modules only
    imported) the first time the registry is queried.  An instance can be
    indexed directly with those keys.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Only the plain-file opener is registered eagerly; everything else
        # waits for the first call to `_load`.
        self._file_openers = {None: io.open}
        self._loaded = False

    def _load(self):
        """Register the compressed-file openers on first use.

        Each compression module is imported independently, so a missing
        module only leaves out its own extension(s).
        """
        if not self._loaded:
            registry = self._file_openers

            try:
                import bz2
                registry[".bz2"] = bz2.open
            except ImportError:
                pass

            try:
                import gzip
                registry[".gz"] = gzip.open
            except ImportError:
                pass

            try:
                import lzma
                # There are incompatible backports of lzma that do not have
                # the lzma.open attribute, so AttributeError is caught as
                # well as ImportError.
                lzma_open = lzma.open
                registry[".xz"] = lzma_open
                registry[".lzma"] = lzma_open
            except (ImportError, AttributeError):
                pass

            self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Returns
        -------
        keys : list
            ``None`` for uncompressed files, followed by the file extension
            strings (e.g. ``'.gz'``, ``'.xz'``) of the compression methods
            that could be loaded.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for `key`; raise KeyError if none."""
        self._load()
        return self._file_openers[key]
153
# Shared module-level registry; DataSource consults it to recognise
# compressed-file extensions and to pick the matching opener.
_file_openers = _FileOpeners()
155
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    This is a shortcut for creating a `DataSource` rooted at `destpath`
    and calling its ``open`` method.  If `path` is a URL, it is
    downloaded, stored below `destpath` and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path.  Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    The `DataSource` created here is not returned to the caller; only the
    file object obtained from ``DataSource.open(path)`` is.

    """
    return DataSource(destpath).open(
        path, mode, encoding=encoding, newline=newline)
195
196
@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <_io.TextIOWrapper name='/home/guido/foobar.txt' mode='r' encoding='UTF-8'>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            # Any falsy destpath (None, '') selects a fresh temp directory.
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  The attribute is absent when __init__
        # did not run to completion, hence the defaulted lookup.
        if getattr(self, '_istmpdest', False):
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename has a supported compression extension.

        Only the file extension is inspected, not the file contents.

        """
        _fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Any of ``'w'`` (truncate), ``'a'`` (append), ``'x'`` (exclusive
        create) or ``'+'`` (update) makes the mode a write mode.

        """
        return any(flag in "wax+" for flag in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                `zip_ext` includes the leading dot (e.g. ``'.gz'``) and is
                None when `filename` has no supported compression
                extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself.  Unless `filename`
        already carries a compression extension, one candidate per
        supported extension is appended.

        """
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # Skip the None key that stands for uncompressed files.
                if zipext:
                    names.append(filename+zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        upath = self.abspath(path)

        # Ensure the directory exists; exist_ok avoids the race between a
        # separate existence check and the directory creation.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            # We import this here because importing urllib is slow and
            # a significant fraction of numpy's total import time.
            from urllib.request import urlopen

            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        # Both components are made relative so that the join below cannot
        # escape from self._destpath.
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged, since removing
        # one kind of prefix can expose another.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Anything that is not a URL has no further place to be looked up.
        if not self._isurl(path):
            return False

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test remote url; the handle is only opened to probe reachability
        # and is closed again right away.
        try:
            with urlopen(path):
                pass
        except URLError:
            return False
        return True

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        FileNotFoundError
            If `path` (or a compressed variant of it) cannot be found.
            This is a subclass of ``OSError``/``IOError``.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        # The extension includes the leading dot.  bz2 has no update ('+')
        # modes, so the flag is dropped; str.replace returns a new string.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)
532
533
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live below one base URL or directory.

    `Repository` prepends a base URL (or directory) to every path it is
    asked to handle.  It is meant for working with several files from the
    same location: initialize the `Repository` with the base URL once,
    then refer to each file by its filename only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the base class's business.
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl occurs in path already, so path is used unchanged.
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        complete = self._fullpath(path)
        return super()._findfile(complete)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        `path` is first completed with the repository's `baseurl` (unless
        it already contains it) and then resolved like
        `DataSource.abspath`: for an URL the result is the location the
        file exists locally, or would exist after being opened with the
        `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        complete = self._fullpath(path)
        return super().abspath(complete)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        complete = self._fullpath(path)
        return super().exists(complete)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        complete = self._fullpath(path)
        return super().open(complete, mode,
                            encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
              "Directory listing of URLs, not supported yet.")
703