# Natural Language Toolkit: Utility functions
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Functions to find and load NLTK resource files, such as corpora,
grammars, and saved processing objects.  Resource files are identified
using URLs, such as ``nltk:corpora/abc/rural.txt`` or
``http://nltk.org/sample/toy.cfg``.  The following URL protocols are
supported:

  - ``file:path``: Specifies the file whose path is *path*.
    Both relative and absolute paths may be used.

  - ``http://host/path``: Specifies the file stored on the web
    server *host* at path *path*.

  - ``nltk:path``: Specifies the file stored in the NLTK data
    package at *path*.  NLTK will search for these files in the
    directories specified by ``nltk.data.path``.

If no protocol is specified, then the default protocol ``nltk:`` will
be used.
This module provides two functions that can be used to access a
resource file, given its URL: ``load()`` loads a given resource, and
adds it to a resource cache; and ``retrieve()`` copies a given resource
to a local file.
"""
from __future__ import print_function, unicode_literals, division

import functools
import textwrap
import io
import os
import re
import sys
import zipfile
import codecs

from abc import ABCMeta, abstractmethod
from gzip import GzipFile, WRITE as GZ_WRITE

from six import add_metaclass
from six import string_types, text_type
from six.moves.urllib.request import urlopen, url2pathname

try:
    import cPickle as pickle
except ImportError:
    import pickle
try:  # Python 3.
    textwrap_indent = functools.partial(textwrap.indent, prefix='  ')
except AttributeError:  # Python 2; textwrap.indent() is not available.
    textwrap_fill = functools.partial(
        textwrap.fill,
        initial_indent='  ',
        subsequent_indent='  ',
        replace_whitespace=False,
    )

    def textwrap_indent(text):
        return '\n'.join(textwrap_fill(line) for line in text.splitlines())


try:
    from zlib import Z_SYNC_FLUSH as FLUSH
except ImportError:
    from zlib import Z_FINISH as FLUSH

# this import should be more specific:
import nltk
from nltk.compat import py3_data, add_py3_data, BytesIO

######################################################################
# Search Path
######################################################################

path = []
"""A list of directories where the NLTK data package might reside.
   These directories will be checked in order when looking for a
   resource in the data package.  Note that this allows users to
   substitute in their own versions of resources, if they have them
   (e.g., in their home directory under ~/nltk_data)."""

# User-specified locations:
_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
path += [d for d in _paths_from_env if d]
if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
    path.append(os.path.expanduser(str('~/nltk_data')))

if sys.platform.startswith('win'):
    # Common locations on Windows:
    path += [
        os.path.join(sys.prefix, str('nltk_data')),
        os.path.join(sys.prefix, str('share'), str('nltk_data')),
        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
        os.path.join(os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data')),
        str(r'C:\nltk_data'),
        str(r'D:\nltk_data'),
        str(r'E:\nltk_data'),
    ]
else:
    # Common locations on UNIX & OS X:
    path += [
        os.path.join(sys.prefix, str('nltk_data')),
        os.path.join(sys.prefix, str('share'), str('nltk_data')),
        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
        str('/usr/share/nltk_data'),
        str('/usr/local/share/nltk_data'),
        str('/usr/lib/nltk_data'),
        str('/usr/local/lib/nltk_data'),
    ]
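
# Example (illustrative): directories can be prepended to ``path`` at runtime
# so that a custom location is searched first.  The '/opt/my_nltk_data'
# directory below is a hypothetical example path.
#
#     import nltk.data
#     nltk.data.path.insert(0, '/opt/my_nltk_data')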

######################################################################
# Util Functions
######################################################################


def gzip_open_unicode(
    filename,
    mode="rb",
    compresslevel=9,
    encoding='utf-8',
    fileobj=None,
    errors=None,
    newline=None,
):
    if fileobj is None:
        fileobj = GzipFile(filename, mode, compresslevel, fileobj)
    return io.TextIOWrapper(fileobj, encoding, errors, newline)
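
# Example (illustrative): ``gzip_open_unicode`` wraps a ``GzipFile`` in a
# ``TextIOWrapper`` so the caller reads unicode text instead of raw bytes.
# The 'corpus.txt.gz' filename below is a hypothetical example file.
#
#     with gzip_open_unicode('corpus.txt.gz', 'rb', encoding='utf-8') as f:
#         first_line = f.readline()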


def split_resource_url(resource_url):
    """
    Splits a resource url into "<protocol>:<path>".

    >>> windows = sys.platform.startswith('win')
    >>> split_resource_url('nltk:home/nltk')
    ('nltk', 'home/nltk')
    >>> split_resource_url('nltk:/home/nltk')
    ('nltk', '/home/nltk')
    >>> split_resource_url('file:/home/nltk')
    ('file', '/home/nltk')
    >>> split_resource_url('file:///home/nltk')
    ('file', '/home/nltk')
    >>> split_resource_url('file:///C:/home/nltk')
    ('file', '/C:/home/nltk')
    """
    protocol, path_ = resource_url.split(':', 1)
    if protocol == 'nltk':
        pass
    elif protocol == 'file':
        if path_.startswith('/'):
            path_ = '/' + path_.lstrip('/')
    else:
        path_ = re.sub(r'^/{0,2}', '', path_)
    return protocol, path_


def normalize_resource_url(resource_url):
    r"""
    Normalizes a resource url.

    >>> windows = sys.platform.startswith('win')
    >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \
    ... ('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg'))
    True
    >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file'
    True
    >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg'
    True
    >>> normalize_resource_url('nltk:home/nltk')
    'nltk:home/nltk'
    >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk'
    True
    >>> normalize_resource_url('http://example.com/dir/file')
    'http://example.com/dir/file'
    >>> normalize_resource_url('dir/file')
    'nltk:dir/file'
    """
    try:
        protocol, name = split_resource_url(resource_url)
    except ValueError:
        # the resource url has no protocol, use the nltk protocol by default
        protocol = 'nltk'
        name = resource_url
    # use file protocol if the path is an absolute path
    if protocol == 'nltk' and os.path.isabs(name):
        protocol = 'file://'
        name = normalize_resource_name(name, False, None)
    elif protocol == 'file':
        protocol = 'file://'
        # name is absolute
        name = normalize_resource_name(name, False, None)
    elif protocol == 'nltk':
        protocol = 'nltk:'
        name = normalize_resource_name(name, True)
    else:
        # handled by urllib
        protocol += '://'
    return ''.join([protocol, name])


def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
    """
    :type resource_name: str or unicode
    :param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        ``corpora/brown``.  Directory names will automatically
        be converted to a platform-appropriate path separator.
        Directory trailing slashes are preserved.

    >>> windows = sys.platform.startswith('win')
    >>> normalize_resource_name('.', True)
    './'
    >>> normalize_resource_name('./', True)
    './'
    >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file'
    True
    >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file'
    True
    >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file'
    True
    >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file'
    True
    >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file'
    True
    >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
    True
    """
    is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(
        os.path.sep
    )
    if sys.platform.startswith('win'):
        resource_name = resource_name.lstrip('/')
    else:
        resource_name = re.sub(r'^/+', '/', resource_name)
    if allow_relative:
        resource_name = os.path.normpath(resource_name)
    else:
        if relative_path is None:
            relative_path = os.curdir
        resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
    resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
    if sys.platform.startswith('win') and os.path.isabs(resource_name):
        resource_name = '/' + resource_name
    if is_dir and not resource_name.endswith('/'):
        resource_name += '/'
    return resource_name


######################################################################
# Path Pointers
######################################################################


@add_metaclass(ABCMeta)
class PathPointer(object):
    """
    An abstract base class for 'path pointers,' used by NLTK's data
    package to identify specific paths.  Two subclasses exist:
    ``FileSystemPathPointer`` identifies a file that can be accessed
    directly via a given absolute path.  ``ZipFilePathPointer``
    identifies a file contained within a zipfile, that can be accessed
    by reading that zipfile.
    """

    @abstractmethod
    def open(self, encoding=None):
        """
        Return a seekable read-only stream that can be used to read
        the contents of the file identified by this path pointer.

        :raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """

    @abstractmethod
    def file_size(self):
        """
        Return the size of the file pointed to by this path pointer,
        in bytes.

        :raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """

    @abstractmethod
    def join(self, fileid):
        """
        Return a new path pointer formed by starting at the path
        identified by this pointer, and then following the relative
        path given by ``fileid``.  The path components of ``fileid``
        should be separated by forward slashes, regardless of
        the underlying file system's path separator character.
        """


class FileSystemPathPointer(PathPointer, text_type):
    """
    A path pointer that identifies a file which can be accessed
    directly via a given absolute path.
    """

    @py3_data
    def __init__(self, _path):
        """
        Create a new path pointer for the given absolute path.

        :raise IOError: If the given path does not exist.
        """

        _path = os.path.abspath(_path)
        if not os.path.exists(_path):
            raise IOError('No such file or directory: %r' % _path)
        self._path = _path

        # There's no need to call str.__init__(), since it's a no-op;
        # str does all of its setup work in __new__.

    @property
    def path(self):
        """The absolute path identified by this path pointer."""
        return self._path

    def open(self, encoding=None):
        stream = open(self._path, 'rb')
        if encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        return os.stat(self._path).st_size

    def join(self, fileid):
        _path = os.path.join(self._path, fileid)
        return FileSystemPathPointer(_path)

    def __repr__(self):
        # This should be a byte string under Python 2.x;
        # we don't want transliteration here so
        # @python_2_unicode_compatible is not used.
        return str('FileSystemPathPointer(%r)' % self._path)

    def __str__(self):
        return self._path
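
# Example (illustrative): a ``FileSystemPathPointer`` behaves like a string
# but also supports ``open()``, ``file_size()`` and ``join()``.  The
# '/tmp/nltk_demo' path below is a hypothetical example directory.
#
#     ptr = FileSystemPathPointer('/tmp/nltk_demo')
#     readme = ptr.join('README')               # -> FileSystemPathPointer
#     size = readme.file_size()                 # size in bytes
#     stream = readme.open(encoding='utf-8')    # seekable unicode stream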


class BufferedGzipFile(GzipFile):
    """
    A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``.
    This allows faster reads and writes of data to and from gzip-compressed
    files at the cost of using more memory.

    The default buffer size is 2MB.

    ``BufferedGzipFile`` is useful for loading large gzipped pickle objects
    as well as writing large encoded feature files for classifier training.
    """

    MB = 2 ** 20
    SIZE = 2 * MB

    @py3_data
    def __init__(
        self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs
    ):
        """
        Return a buffered gzip file object.

        :param filename: a filesystem path
        :type filename: str
        :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
            'w', or 'wb'
        :type mode: str
        :param compresslevel: The compresslevel argument is an integer from 1
            to 9 controlling the level of compression; 1 is fastest and
            produces the least compression, and 9 is slowest and produces the
            most compression. The default is 9.
        :type compresslevel: int
        :param fileobj: a BytesIO stream to read from instead of a file.
        :type fileobj: BytesIO
        :param size: number of bytes to buffer during calls to read() and write()
        :type size: int
        :rtype: BufferedGzipFile
        """
        GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
        self._size = kwargs.get('size', self.SIZE)
        self._nltk_buffer = BytesIO()
        # cStringIO does not support len.
        self._len = 0

    def _reset_buffer(self):
        # For some reason calling BytesIO.truncate() here will lead to
        # inconsistent writes so just set _buffer to a new BytesIO object.
        self._nltk_buffer = BytesIO()
        self._len = 0

    def _write_buffer(self, data):
        # Simply write to the buffer and increment the buffer size.
        if data is not None:
            self._nltk_buffer.write(data)
            self._len += len(data)

    def _write_gzip(self, data):
        # Write the current buffer to the GzipFile.
        GzipFile.write(self, self._nltk_buffer.getvalue())
        # Then reset the buffer and write the new data to the buffer.
        self._reset_buffer()
        self._write_buffer(data)

    def close(self):
        # GzipFile.close() doesn't actually close anything.
        if self.mode == GZ_WRITE:
            self._write_gzip(None)
            self._reset_buffer()
        return GzipFile.close(self)

    def flush(self, lib_mode=FLUSH):
        self._nltk_buffer.flush()
        GzipFile.flush(self, lib_mode)

    def read(self, size=None):
        if not size:
            size = self._size
            contents = BytesIO()
            while True:
                blocks = GzipFile.read(self, size)
                if not blocks:
                    contents.flush()
                    break
                contents.write(blocks)
            return contents.getvalue()
        else:
            return GzipFile.read(self, size)

    def write(self, data, size=-1):
        """
        :param data: bytes to write to file or buffer
        :type data: bytes
        :param size: buffer at least size bytes before writing to file
        :type size: int
        """
        if size <= 0:
            # A non-positive size means "use the default buffer size".
            size = self._size
        if self._len + len(data) <= size:
            self._write_buffer(data)
        else:
            self._write_gzip(data)
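
# Example (illustrative): writes smaller than the buffer size accumulate in
# memory and are compressed in larger batches as the buffer fills and when
# the file is closed.  The 'features.gz' filename and the
# 'encoded_feature_vectors' iterable below are hypothetical examples.
#
#     outfile = BufferedGzipFile('features.gz', 'wb', size=BufferedGzipFile.SIZE)
#     for vector in encoded_feature_vectors:    # hypothetical iterable of bytes
#         outfile.write(vector, BufferedGzipFile.SIZE)
#     outfile.close()                           # flushes the remaining buffer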


class GzipFileSystemPathPointer(FileSystemPathPointer):
    """
    A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed
    file located at a given absolute path.  ``GzipFileSystemPathPointer`` is
    appropriate for loading large gzip-compressed pickle objects efficiently.
    """

    def open(self, encoding=None):
        # Note: In >= Python3.5, GzipFile is already using a
        # buffered reader in the backend which has a variable self._buffer
        # See https://github.com/nltk/nltk/issues/1308
        if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
            stream = BufferedGzipFile(self._path, 'rb')
        else:
            stream = GzipFile(self._path, 'rb')
        if encoding:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream


class ZipFilePathPointer(PathPointer):
    """
    A path pointer that identifies a file contained within a zipfile,
    which can be accessed by reading that zipfile.
    """

    @py3_data
    def __init__(self, zipfile, entry=''):
        """
        Create a new path pointer pointing at the specified entry
        in the given zipfile.

        :raise IOError: If the given zipfile does not exist, or if it
            does not contain the specified entry.
        """
        if isinstance(zipfile, string_types):
            zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))

        # Check that the entry exists:
        if entry:

            # Normalize the entry string, it should be relative:
            entry = normalize_resource_name(entry, True, '/').lstrip('/')

            try:
                zipfile.getinfo(entry)
            except Exception:
                # Sometimes directories aren't explicitly listed in
                # the zip file.  So if `entry` is a directory name,
                # then check if the zipfile contains any files that
                # are under the given directory.
                if entry.endswith('/') and [
                    n for n in zipfile.namelist() if n.startswith(entry)
                ]:
                    pass  # zipfile contains a file in that directory.
                else:
                    # Otherwise, complain.
                    raise IOError(
                        'Zipfile %r does not contain %r' % (zipfile.filename, entry)
                    )
        self._zipfile = zipfile
        self._entry = entry

    @property
    def zipfile(self):
        """
        The zipfile.ZipFile object used to access the zip file
        containing the entry identified by this path pointer.
        """
        return self._zipfile

    @property
    def entry(self):
        """
        The name of the file within zipfile that this path
        pointer points to.
        """
        return self._entry

    def open(self, encoding=None):
        data = self._zipfile.read(self._entry)
        stream = BytesIO(data)
        if self._entry.endswith('.gz'):
            # Note: In >= Python3.5, GzipFile is already using a
            # buffered reader in the backend which has a variable self._buffer
            # See https://github.com/nltk/nltk/issues/1308
            if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
                stream = BufferedGzipFile(self._entry, fileobj=stream)
            else:
                stream = GzipFile(self._entry, fileobj=stream)
        elif encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        return self._zipfile.getinfo(self._entry).file_size

    def join(self, fileid):
        entry = '%s/%s' % (self._entry, fileid)
        return ZipFilePathPointer(self._zipfile, entry)

    def __repr__(self):
        return str('ZipFilePathPointer(%r, %r)') % (self._zipfile.filename, self._entry)

    def __str__(self):
        return os.path.normpath(os.path.join(self._zipfile.filename, self._entry))
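
# Example (illustrative): a ``ZipFilePathPointer`` addresses a single entry
# inside a zip archive.  The 'corpora/abc.zip' path below is a hypothetical
# example; in practice ``find()`` constructs these pointers for you.
#
#     zip_ptr = ZipFilePathPointer('corpora/abc.zip', 'abc/rural.txt')
#     stream = zip_ptr.open(encoding='utf-8')
#     print(zip_ptr.file_size(), 'bytes')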


######################################################################
# Access Functions
######################################################################

# Don't use a weak dictionary, because in the common case this
# causes a lot more reloading than necessary.
_resource_cache = {}
"""A dictionary used to cache resources so that they won't
   need to be loaded more than once."""


def find(resource_name, paths=None):
    """
    Find the given resource by searching through the directories and
    zip files in paths, where a None or empty string specifies an absolute path.
    Returns a corresponding path name.  If the given resource is not
    found, raise a ``LookupError``, whose message gives a pointer to
    the installation instructions for the NLTK downloader.

    Zip File Handling:

      - If ``resource_name`` contains a component with a ``.zip``
        extension, then it is assumed to be a zipfile; and the
        remaining path components are used to look inside the zipfile.

      - If any element of ``nltk.data.path`` has a ``.zip`` extension,
        then it is assumed to be a zipfile.

      - If a given resource name that does not contain any zipfile
        component is not found initially, then ``find()`` will make a
        second attempt to find that resource, by replacing each
        component *p* in the path with *p.zip/p*.  For example, this
        allows ``find()`` to map the resource name
        ``corpora/chat80/cities.pl`` to a zip file path pointer to
        ``corpora/chat80.zip/chat80/cities.pl``.

      - When using ``find()`` to locate a directory contained in a
        zipfile, the resource name must end with the forward slash
        character.  Otherwise, ``find()`` will not locate the
        directory.

    :type resource_name: str or unicode
    :param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        ``corpora/brown``.  Directory names will be
        automatically converted to a platform-appropriate path separator.
    :rtype: str
    """
    resource_name = normalize_resource_name(resource_name, True)

    # Resolve default paths at runtime in case the user overrides
    # nltk.data.path
    if paths is None:
        paths = path

    # Check if the resource name includes a zipfile name
    m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
    zipfile, zipentry = m.groups()

    # Check each item in our path
    for path_ in paths:
        # Is the path item a zipfile?
        if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
            try:
                return ZipFilePathPointer(path_, resource_name)
            except IOError:
                # resource not in zipfile
                continue

        # Is the path item a directory or is resource_name an absolute path?
        elif not path_ or os.path.isdir(path_):
            if zipfile is None:
                p = os.path.join(path_, url2pathname(resource_name))
                if os.path.exists(p):
                    if p.endswith('.gz'):
                        return GzipFileSystemPathPointer(p)
                    else:
                        return FileSystemPathPointer(p)
            else:
                p = os.path.join(path_, url2pathname(zipfile))
                if os.path.exists(p):
                    try:
                        return ZipFilePathPointer(p, zipentry)
                    except IOError:
                        # resource not in zipfile
                        continue

    # Fallback: if the path doesn't include a zip file, then try
    # again, assuming that one of the path components is inside a
    # zipfile of the same name.
    if zipfile is None:
        pieces = resource_name.split('/')
        for i in range(len(pieces)):
            modified_name = '/'.join(pieces[:i] + [pieces[i] + '.zip'] + pieces[i:])
            try:
                return find(modified_name, paths)
            except LookupError:
                pass

    # Identify the package (i.e. the .zip file) to download.
    resource_zipname = resource_name.split('/')[1]
    if resource_zipname.endswith('.zip'):
        resource_zipname = resource_zipname.rpartition('.')[0]
    # Display a friendly error message if the resource wasn't found:
    msg = str(
        "Resource \33[93m{resource}\033[0m not found.\n"
        "Please use the NLTK Downloader to obtain the resource:\n\n"
        "\33[31m"  # To display red text in terminal.
        ">>> import nltk\n"
        ">>> nltk.download(\'{resource}\')\n"
        "\033[0m"
    ).format(resource=resource_zipname)
    msg = textwrap_indent(msg)

    msg += '\n  For more information see: https://www.nltk.org/data.html\n'

    msg += '\n  Attempted to load \33[93m{resource_name}\033[0m\n'.format(
        resource_name=resource_name
    )

    msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in paths)
    sep = '*' * 70
    resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
    raise LookupError(resource_not_found)
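
# Example (illustrative; assumes the 'abc' corpus has been downloaded with
# ``nltk.download('abc')``): ``find()`` resolves a resource name to a path
# pointer, transparently looking inside 'corpora/abc.zip' if necessary.
#
#     ptr = find('corpora/abc/rural.txt')
#     text = ptr.open(encoding='utf-8').read()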


def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named ``filename``, then raise a ``ValueError``.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    """
    resource_url = normalize_resource_url(resource_url)
    if filename is None:
        if resource_url.startswith('file:'):
            filename = os.path.split(resource_url)[-1]
        else:
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError("File %r already exists!" % filename)

    if verbose:
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the input & output streams.
    infile = _open(resource_url)

    # Copy infile -> outfile, using 64k blocks.
    with open(filename, "wb") as outfile:
        while True:
            s = infile.read(1024 * 64)  # 64k blocks.
            outfile.write(s)
            if not s:
                break

    infile.close()
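
# Example (illustrative): copy a resource into the current directory.  The
# target filename defaults to the last component of the URL, and an existing
# file is never overwritten.
#
#     retrieve('nltk:corpora/abc/rural.txt')                 # -> ./rural.txt
#     retrieve('http://nltk.org/sample/toy.cfg', 'my_toy.cfg')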


#: A dictionary describing the formats that are supported by NLTK's
#: load() method.  Keys are format names, and values are format
#: descriptions.
FORMATS = {
    'pickle': "A serialized python object, stored using the pickle module.",
    'json': "A serialized python object, stored using the json module.",
    'yaml': "A serialized python object, stored using the yaml module.",
    'cfg': "A context free grammar.",
    'pcfg': "A probabilistic CFG.",
    'fcfg': "A feature CFG.",
    'fol': "A list of first order logic expressions, parsed with "
    "nltk.sem.logic.Expression.fromstring.",
    'logic': "A list of first order logic expressions, parsed with "
    "nltk.sem.logic.LogicParser.  Requires an additional logic_parser "
    "parameter.",
    'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
    'raw': "The raw (byte string) contents of a file.",
    'text': "The raw (unicode string) contents of a file.",
}

#: A dictionary mapping from file extensions to format names, used
#: by load() when format="auto" to decide the format for a
#: given resource url.
AUTO_FORMATS = {
    'pickle': 'pickle',
    'json': 'json',
    'yaml': 'yaml',
    'cfg': 'cfg',
    'pcfg': 'pcfg',
    'fcfg': 'fcfg',
    'fol': 'fol',
    'logic': 'logic',
    'val': 'val',
    'txt': 'text',
    'text': 'text',
}


def load(
    resource_url,
    format='auto',
    cache=True,
    verbose=False,
    logic_parser=None,
    fstruct_reader=None,
    encoding=None,
):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:

      - ``pickle``
      - ``json``
      - ``yaml``
      - ``cfg`` (context free grammars)
      - ``pcfg`` (probabilistic CFGs)
      - ``fcfg`` (feature-based CFGs)
      - ``fol`` (formulas of First Order Logic)
      - ``logic`` (Logical formulas to be parsed by the given logic_parser)
      - ``val`` (valuation of First Order Logic model)
      - ``text`` (the file contents as a unicode string)
      - ``raw`` (the raw file contents as a byte string)

    If no format is specified, ``load()`` will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, ``load()`` will raise a ``ValueError`` exception.

    For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``),
    it tries to decode the raw contents using UTF-8, and if that doesn't
    work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding``
    is specified.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    :type cache: bool
    :param cache: If true, add this resource to a cache.  If load()
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache
        when no more objects are using it.
    :type verbose: bool
    :param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    :type logic_parser: LogicParser
    :param logic_parser: The parser that will be used to parse logical
        expressions.
    :type fstruct_reader: FeatStructReader
    :param fstruct_reader: The parser that will be used to parse the
        feature structure of an fcfg.
    :type encoding: str
    :param encoding: the encoding of the input; only used for text formats.
    """
    resource_url = normalize_resource_url(resource_url)
    resource_url = add_py3_data(resource_url)

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        if ext == 'gz':
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError(
                'Could not determine format for %s based '
                'on its file\nextension; use the "format" '
                'argument to specify the format explicitly.' % resource_url
            )

    if format not in FORMATS:
        raise ValueError('Unknown format type: %s!' % (format,))

    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get((resource_url, format))
        if resource_val is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Load the resource.
    opened_resource = _open(resource_url)

    if format == 'raw':
        resource_val = opened_resource.read()
    elif format == 'pickle':
        resource_val = pickle.load(opened_resource)
    elif format == 'json':
        import json
        from nltk.jsontags import json_tags

        resource_val = json.load(opened_resource)
        tag = None
        if len(resource_val) != 1:
            tag = next(iter(resource_val.keys()))
        if tag not in json_tags:
            raise ValueError('Unknown json tag.')
    elif format == 'yaml':
        import yaml

        resource_val = yaml.load(opened_resource)
    else:
        # The resource is a text format.
        binary_data = opened_resource.read()
        if encoding is not None:
            string_data = binary_data.decode(encoding)
        else:
            try:
                string_data = binary_data.decode('utf-8')
            except UnicodeDecodeError:
                string_data = binary_data.decode('latin-1')
        if format == 'text':
            resource_val = string_data
        elif format == 'cfg':
            resource_val = nltk.grammar.CFG.fromstring(string_data, encoding=encoding)
        elif format == 'pcfg':
            resource_val = nltk.grammar.PCFG.fromstring(string_data, encoding=encoding)
        elif format == 'fcfg':
            resource_val = nltk.grammar.FeatureGrammar.fromstring(
                string_data,
                logic_parser=logic_parser,
                fstruct_reader=fstruct_reader,
                encoding=encoding,
            )
        elif format == 'fol':
            resource_val = nltk.sem.read_logic(
                string_data,
                logic_parser=nltk.sem.logic.LogicParser(),
                encoding=encoding,
            )
        elif format == 'logic':
            resource_val = nltk.sem.read_logic(
                string_data, logic_parser=logic_parser, encoding=encoding
            )
        elif format == 'val':
            resource_val = nltk.sem.read_valuation(string_data, encoding=encoding)
        else:
            raise AssertionError(
                "Internal NLTK error: Format %s isn't "
                "handled by nltk.data.load()" % (format,)
            )

    opened_resource.close()

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[(resource_url, format)] = resource_val
            # TODO: add this line
            # print('<<Caching a copy of %s>>' % (resource_url,))
        except TypeError:
            # We can't create weak references to some object types, like
            # strings and tuples.  For now, just don't cache them.
            pass

    return resource_val
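
# Example (illustrative; assumes the resources below have been installed via
# the NLTK downloader): the format is normally inferred from the extension,
# and repeated loads of the same URL are served from the in-memory cache.
#
#     grammar = load('grammars/large_grammars/atis.cfg')    # parsed as 'cfg'
#     grammar = load('grammars/large_grammars/atis.cfg')    # cache hit
#     raw = load('corpora/abc/rural.txt', format='raw')     # bytes, no parsing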


def show_cfg(resource_url, escape='##'):
    """
    Write out a grammar file, ignoring escaped and empty lines.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    :type escape: str
    :param escape: Prepended string that signals lines to be ignored
    """
    resource_url = normalize_resource_url(resource_url)
    resource_val = load(resource_url, format='text', cache=False)
    lines = resource_val.splitlines()
    for line in lines:
        if line.startswith(escape):
            continue
        if re.match('^$', line):
            continue
        print(line)


def clear_cache():
    """
    Remove all objects from the resource cache.
    :see: load()
    """
    _resource_cache.clear()


def _open(resource_url):
    """
    Helper function that returns an open file object for a resource,
    given its resource URL.  If the given resource URL uses the "nltk:"
    protocol, or uses no protocol, then use ``nltk.data.find`` to find
    its path, and open it with the given mode; if the resource URL
    uses the 'file' protocol, then open the file with the given mode;
    otherwise, delegate to ``urllib.request.urlopen``.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    """
    resource_url = normalize_resource_url(resource_url)
    protocol, path_ = split_resource_url(resource_url)

    if protocol is None or protocol.lower() == 'nltk':
        return find(path_, path + ['']).open()
    elif protocol.lower() == 'file':
        # urllib might not use mode='rb', so handle this one ourselves:
        return find(path_, ['']).open()
    else:
        return urlopen(resource_url)


######################################################################
# Lazy Resource Loader
######################################################################

# We shouldn't apply @python_2_unicode_compatible
# decorator to LazyLoader, this is resource.__class__ responsibility.


class LazyLoader(object):
    @py3_data
    def __init__(self, _path):
        self._path = _path

    def __load(self):
        resource = load(self._path)
        # This is where the magic happens!  Transform ourselves into
        # the object by modifying our own __dict__ and __class__ to
        # match that of `resource`.
        self.__dict__ = resource.__dict__
        self.__class__ = resource.__class__

    def __getattr__(self, attr):
        self.__load()
        # This looks circular, but it's not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        self.__load()
        # This looks circular, but it's not, since __load() changes our
        # __class__ to something new:
        return repr(self)
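
# Example (illustrative): a ``LazyLoader`` defers the (potentially expensive)
# call to ``load()`` until the object is first used; afterwards it *is* the
# loaded object.  The pickle path below is a hypothetical example resource.
#
#     tagger = LazyLoader('taggers/my_tagger.pickle')   # nothing loaded yet
#     tagger.tag(['Hello', 'world'])                    # triggers the real load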


######################################################################
# Open-On-Demand ZipFile
######################################################################


class OpenOnDemandZipFile(zipfile.ZipFile):
    """
    A subclass of ``zipfile.ZipFile`` that closes its file pointer
    whenever it is not using it; and re-opens it when it needs to read
    data from the zipfile.  This is useful for reducing the number of
    open file handles when many zip files are being accessed at once.
    ``OpenOnDemandZipFile`` must be constructed from a filename, not a
    file-like object (to allow re-opening).  ``OpenOnDemandZipFile`` is
    read-only (i.e. ``write()`` and ``writestr()`` are disabled).
    """

    @py3_data
    def __init__(self, filename):
        if not isinstance(filename, string_types):
            raise TypeError('ReopenableZipFile filename must be a string')
        zipfile.ZipFile.__init__(self, filename)
        assert self.filename == filename
        self.close()
        # After closing a ZipFile object, the _fileRefCnt needs to be cleared
        # for Python 2 and 3 compatible code.
        self._fileRefCnt = 0

    def read(self, name):
        assert self.fp is None
        self.fp = open(self.filename, 'rb')
        value = zipfile.ZipFile.read(self, name)
        # _fileRefCnt needs to be kept in sync for Python 2 and 3 compatible
        # code.  Since we only opened one file here, we add 1.
        self._fileRefCnt += 1
        self.close()
        return value

    def write(self, *args, **kwargs):
        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def writestr(self, *args, **kwargs):
        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def __repr__(self):
        return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
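
# Example (illustrative): the file handle is only held open for the duration
# of each ``read()`` call.  The 'corpora/abc.zip' path below is a
# hypothetical example archive.
#
#     zf = OpenOnDemandZipFile('corpora/abc.zip')
#     data = zf.read('abc/rural.txt')    # opens, reads, and closes the file
#     assert zf.fp is None               # no handle kept between reads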


######################################################################
# { Seekable Unicode Stream Reader
######################################################################


class SeekableUnicodeStreamReader(object):
    """
    A stream reader that automatically encodes the source byte stream
    into unicode (like ``codecs.StreamReader``); but still supports the
    ``seek()`` and ``tell()`` operations correctly.  This is in contrast
    to ``codecs.StreamReader``, which provides *broken* ``seek()`` and
    ``tell()`` methods.

    This class was motivated by ``StreamBackedCorpusView``, which
    makes extensive use of ``seek()`` and ``tell()``, and needs to be
    able to handle unicode-encoded files.

    Note: this class requires stateless decoders.  To my knowledge,
    this shouldn't cause a problem with any of python's builtin
    unicode encodings.
    """

    DEBUG = True  # : If true, then perform extra sanity checks.

    @py3_data
    def __init__(self, stream, encoding, errors='strict'):
        # Rewind the stream to its beginning.
        stream.seek(0)

        self.stream = stream
        """The underlying stream."""

        self.encoding = encoding
        """The name of the encoding that should be used to encode the
           underlying stream."""

        self.errors = errors
        """The error mode that should be used when decoding data from
           the underlying stream.  Can be 'strict', 'ignore', or
           'replace'."""

        self.decode = codecs.getdecoder(encoding)
        """The function that is used to decode byte strings into
           unicode strings."""

        self.bytebuffer = b''
        """A buffer of bytes that have been read but have not yet
           been decoded.  This is only used when the final bytes from
           a read do not form a complete encoding for a character."""

        self.linebuffer = None
        """A buffer used by ``readline()`` to hold characters that have
           been read, but have not yet been returned by ``read()`` or
           ``readline()``.  This buffer consists of a list of unicode
           strings, where each string corresponds to a single line.
           The final element of the list may or may not be a complete
           line.  Note that the existence of a linebuffer makes the
           ``tell()`` operation more complex, because it must backtrack
           to the beginning of the buffer to determine the correct
           file position in the underlying byte stream."""

        self._rewind_checkpoint = 0
        """The file position at which the most recent read on the
           underlying stream began.  This is used, together with
           ``_rewind_numchars``, to backtrack to the beginning of
           ``linebuffer`` (which is required by ``tell()``)."""

        self._rewind_numchars = None
        """The number of characters that have been returned since the
           read that started at ``_rewind_checkpoint``.  This is used,
           together with ``_rewind_checkpoint``, to backtrack to the
           beginning of ``linebuffer`` (which is required by ``tell()``)."""

        self._bom = self._check_bom()
        """The length of the byte order marker at the beginning of
           the stream (or None for no byte order marker)."""

    # /////////////////////////////////////////////////////////////////
    # Read methods
    # /////////////////////////////////////////////////////////////////

    def read(self, size=None):
        """
        Read up to ``size`` bytes, decode them using this reader's
        encoding, and return the resulting unicode string.

        :param size: The maximum number of bytes to read.  If not
            specified, then read as many bytes as possible.
        :type size: int
        :rtype: unicode
        """
        chars = self._read(size)

        # If linebuffer is not empty, then include it in the result
        if self.linebuffer:
            chars = ''.join(self.linebuffer) + chars
            self.linebuffer = None
            self._rewind_numchars = None

        return chars

    def discard_line(self):
        if self.linebuffer and len(self.linebuffer) > 1:
            line = self.linebuffer.pop(0)
            self._rewind_numchars += len(line)
        else:
            self.stream.readline()

    def readline(self, size=None):
        """
        Read a line of text, decode it using this reader's encoding,
        and return the resulting unicode string.

        :param size: The maximum number of bytes to read.  If no
            newline is encountered before ``size`` bytes have been read,
            then the returned value may not be a complete line of text.
        :type size: int
        """
        # If we have a non-empty linebuffer, then return the first
        # line from it.  (Note that the last element of linebuffer may
        # not be a complete line; so let _read() deal with it.)
        if self.linebuffer and len(self.linebuffer) > 1:
            line = self.linebuffer.pop(0)
            self._rewind_numchars += len(line)
            return line

        readsize = size or 72
        chars = ''

        # If there's a remaining incomplete line in the buffer, add it.
        if self.linebuffer:
            chars += self.linebuffer.pop()
            self.linebuffer = None

        while True:
            startpos = self.stream.tell() - len(self.bytebuffer)
            new_chars = self._read(readsize)

            # If we're at a '\r', then read one extra character, since
            # it might be a '\n', to get the proper line ending.
            if new_chars and new_chars.endswith('\r'):
                new_chars += self._read(1)

            chars += new_chars
            lines = chars.splitlines(True)
            if len(lines) > 1:
                line = lines[0]
                self.linebuffer = lines[1:]
                self._rewind_numchars = len(new_chars) - (len(chars) - len(line))
                self._rewind_checkpoint = startpos
                break
            elif len(lines) == 1:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend:  # complete line
                    line = line0withend
                    break

            if not new_chars or size is not None:
                line = chars
                break

            # Read successively larger blocks of text.
            if readsize < 8000:
                readsize *= 2

        return line

    def readlines(self, sizehint=None, keepends=True):
        """
        Read this file's contents, decode them using this reader's
        encoding, and return it as a list of unicode lines.

        :rtype: list(unicode)
        :param sizehint: Ignored.
        :param keepends: If false, then strip newlines.
        """
        return self.read().splitlines(keepends)

    def next(self):
        """Return the next decoded line from the underlying stream."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration

    def __next__(self):
        return self.next()

    def __iter__(self):
        """Return self"""
        return self

    def __del__(self):
        # let garbage collector deal with still opened streams
        if not self.closed:
            self.close()

    def xreadlines(self):
        """Return self"""
        return self

    # /////////////////////////////////////////////////////////////////
    # Pass-through methods & properties
    # /////////////////////////////////////////////////////////////////

    @property
    def closed(self):
        """True if the underlying stream is closed."""
        return self.stream.closed

    @property
    def name(self):
        """The name of the underlying stream."""
        return self.stream.name

    @property
    def mode(self):
        """The mode of the underlying stream."""
        return self.stream.mode

    def close(self):
        """
        Close the underlying stream.
        """
        self.stream.close()

    # /////////////////////////////////////////////////////////////////
    # Seek and tell
    # /////////////////////////////////////////////////////////////////

    def seek(self, offset, whence=0):
        """
        Move the stream to a new file position.  If the reader is
        maintaining any buffers, then they will be cleared.

        :param offset: A byte count offset.
        :param whence: If 0, then the offset is from the start of the file
            (offset should be positive), if 1, then the offset is from the
            current position (offset may be positive or negative); and if 2,
            then the offset is from the end of the file (offset should
            typically be negative).
        """
        if whence == 1:
            raise ValueError(
                'Relative seek is not supported for '
                'SeekableUnicodeStreamReader -- consider '
                'using char_seek_forward() instead.'
            )
        self.stream.seek(offset, whence)
        self.linebuffer = None
        self.bytebuffer = b''
        self._rewind_numchars = None
        self._rewind_checkpoint = self.stream.tell()

    def char_seek_forward(self, offset):
        """
        Move the read pointer forward by ``offset`` characters.
        """
        if offset < 0:
            raise ValueError('Negative offsets are not supported')
        # Clear all buffers.
        self.seek(self.tell())
        # Perform the seek operation.
        self._char_seek_forward(offset)

    def _char_seek_forward(self, offset, est_bytes=None):
        """
        Move the file position forward by ``offset`` characters,
        ignoring all buffers.

        :param est_bytes: A hint, giving an estimate of the number of
            bytes that will be needed to move forward by ``offset`` chars.
            Defaults to ``offset``.
        """
        if est_bytes is None:
            est_bytes = offset
        bytes = b''

        while True:
            # Read in a block of bytes.
            newbytes = self.stream.read(est_bytes - len(bytes))
            bytes += newbytes

            # Decode the bytes to characters.
            chars, bytes_decoded = self._incr_decode(bytes)

            # If we got the right number of characters, then seek
            # backwards over any truncated characters, and return.
            if len(chars) == offset:
                self.stream.seek(-len(bytes) + bytes_decoded, 1)
                return

            # If we went too far, then we can back-up until we get it
            # right, using the bytes we've already read.
            if len(chars) > offset:
                while len(chars) > offset:
                    # Assume at least one byte/char.
                    est_bytes += offset - len(chars)
                    chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
                self.stream.seek(-len(bytes) + bytes_decoded, 1)
                return

            # Otherwise, we haven't read enough bytes yet; loop again.
            est_bytes += offset - len(chars)

    def tell(self):
        """
        Return the current file position on the underlying byte
        stream.  If this reader is maintaining any buffers, then the
        returned file position will be the position of the beginning
        of those buffers.
        """
        # If nothing's buffered, then just return our current filepos:
        if self.linebuffer is None:
            return self.stream.tell() - len(self.bytebuffer)

        # Otherwise, we'll need to backtrack the filepos until we
        # reach the beginning of the buffer.

        # Store our original file position, so we can return here.
        orig_filepos = self.stream.tell()

        # Calculate an estimate of where we think the newline is.
        bytes_read = (orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint
        buf_size = sum(len(line) for line in self.linebuffer)
        est_bytes = int(
            (bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size))
        )

        self.stream.seek(self._rewind_checkpoint)
        self._char_seek_forward(self._rewind_numchars, est_bytes)
        filepos = self.stream.tell()

        # Sanity check
        if self.DEBUG:
            self.stream.seek(filepos)
            check1 = self._incr_decode(self.stream.read(50))[0]
            check2 = ''.join(self.linebuffer)
            assert check1.startswith(check2) or check2.startswith(check1)

        # Return to our original filepos (so we don't have to throw
        # out our buffer.)
        self.stream.seek(orig_filepos)

        # Return the calculated filepos
        return filepos

    # /////////////////////////////////////////////////////////////////
    # Helper methods
    # /////////////////////////////////////////////////////////////////

    def _read(self, size=None):
        """
        Read up to ``size`` bytes from the underlying stream, decode
        them using this reader's encoding, and return the resulting
        unicode string.  ``linebuffer`` is not included in the result.
        """
        if size == 0:
            return ''

        # Skip past the byte order marker, if present.
        if self._bom and self.stream.tell() == 0:
            self.stream.read(self._bom)

        # Read the requested number of bytes.
        if size is None:
            new_bytes = self.stream.read()
        else:
            new_bytes = self.stream.read(size)
        bytes = self.bytebuffer + new_bytes

        # Decode the bytes into unicode characters
        chars, bytes_decoded = self._incr_decode(bytes)

        # If we got bytes but couldn't decode any, then read further.
        if (size is not None) and (not chars) and (len(new_bytes) > 0):
            while not chars:
                new_bytes = self.stream.read(1)
                if not new_bytes:
                    break  # end of file.
                bytes += new_bytes
                chars, bytes_decoded = self._incr_decode(bytes)

        # Record any bytes we didn't consume.
        self.bytebuffer = bytes[bytes_decoded:]

        # Return the result
        return chars

    def _incr_decode(self, bytes):
        """
        Decode the given byte string into a unicode string, using this
        reader's encoding.  If an exception is encountered that
        appears to be caused by a truncation error, then just decode
        the byte string without the bytes that cause the truncation
        error.

        Return a tuple ``(chars, num_consumed)``, where ``chars`` is
        the decoded unicode string, and ``num_consumed`` is the
        number of bytes that were consumed.
        """
        while True:
            try:
                return self.decode(bytes, 'strict')
            except UnicodeDecodeError as exc:
                # If the exception occurs at the end of the string,
                # then assume that it's a truncation error.
                if exc.end == len(bytes):
                    return self.decode(bytes[: exc.start], self.errors)

                # Otherwise, if we're being strict, then raise it.
                elif self.errors == 'strict':
                    raise

                # If we're not strict, then re-process it with our
                # errors setting.  This *may* raise an exception.
                else:
                    return self.decode(bytes, self.errors)

    _BOM_TABLE = {
        'utf8': [(codecs.BOM_UTF8, None)],
        'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'), (codecs.BOM_UTF16_BE, 'utf16-be')],
        'utf16le': [(codecs.BOM_UTF16_LE, None)],
        'utf16be': [(codecs.BOM_UTF16_BE, None)],
        'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'), (codecs.BOM_UTF32_BE, 'utf32-be')],
        'utf32le': [(codecs.BOM_UTF32_LE, None)],
        'utf32be': [(codecs.BOM_UTF32_BE, None)],
    }

    def _check_bom(self):
        # Normalize our encoding name
        enc = re.sub('[ -]', '', self.encoding.lower())

        # Look up our encoding in the BOM table.
        bom_info = self._BOM_TABLE.get(enc)

        if bom_info:
            # Read a prefix, to check against the BOM(s)
            bytes = self.stream.read(16)
            self.stream.seek(0)

            # Check for each possible BOM.
            for (bom, new_encoding) in bom_info:
                if bytes.startswith(bom):
                    if new_encoding:
                        self.encoding = new_encoding
                    return len(bom)

        return None
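
# Example (illustrative): unlike ``codecs.StreamReader``, positions returned
# by ``tell()`` can safely be passed back to ``seek()``, even in the middle
# of a multi-byte encoded file.  The 'utf8_file.txt' filename below is a
# hypothetical example file.
#
#     reader = SeekableUnicodeStreamReader(open('utf8_file.txt', 'rb'), 'utf-8')
#     first = reader.readline()
#     pos = reader.tell()          # byte offset of the next unread character
#     second = reader.readline()
#     reader.seek(pos)
#     assert reader.readline() == second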


__all__ = [
    'path',
    'PathPointer',
    'FileSystemPathPointer',
    'BufferedGzipFile',
    'GzipFileSystemPathPointer',
    'find',
    'retrieve',
    'FORMATS',
    'AUTO_FORMATS',
    'load',
    'show_cfg',
    'clear_cache',
    'LazyLoader',
    'OpenOnDemandZipFile',
    'SeekableUnicodeStreamReader',
]