# Natural Language Toolkit: Utility functions
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Functions to find and load NLTK resource files, such as corpora,
grammars, and saved processing objects.  Resource files are identified
using URLs, such as ``nltk:corpora/abc/rural.txt`` or
``http://nltk.org/sample/toy.cfg``.  The following URL protocols are
supported:

  - ``file:path``: Specifies the file whose path is *path*.
    Both relative and absolute paths may be used.

  - ``http://host/path``: Specifies the file stored on the web
    server *host* at path *path*.

  - ``nltk:path``: Specifies the file stored in the NLTK data
    package at *path*.  NLTK will search for these files in the
    directories specified by ``nltk.data.path``.

If no protocol is specified, then the default protocol ``nltk:`` will
be used.

This module provides two functions that can be used to access a
resource file, given its URL: ``load()`` loads a given resource, and
adds it to a resource cache; and ``retrieve()`` copies a given resource
to a local file.
"""
from __future__ import print_function, unicode_literals, division

import functools
import textwrap
import io
import os
import re
import sys
import zipfile
import codecs

from abc import ABCMeta, abstractmethod
from gzip import GzipFile, WRITE as GZ_WRITE

from six import add_metaclass
from six import string_types, text_type
from six.moves.urllib.request import urlopen, url2pathname

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:  # Python 3.
    textwrap_indent = functools.partial(textwrap.indent, prefix='  ')
except AttributeError:  # Python 2; indent() is not available in Python 2.
    textwrap_fill = functools.partial(
        textwrap.fill,
        initial_indent='  ',
        subsequent_indent='  ',
        replace_whitespace=False,
    )

    def textwrap_indent(text):
        return '\n'.join(textwrap_fill(line) for line in text.splitlines())


try:
    from zlib import Z_SYNC_FLUSH as FLUSH
except ImportError:
    from zlib import Z_FINISH as FLUSH

# This import should be more specific:
import nltk
from nltk.compat import py3_data, add_py3_data, BytesIO

######################################################################
# Search Path
######################################################################

path = []
"""A list of directories where the NLTK data package might reside.
   These directories will be checked in order when looking for a
   resource in the data package.  Note that this allows users to
   substitute in their own versions of resources, if they have them
   (e.g., in their home directory under ~/nltk_data)."""
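# Usage sketch (illustrative, not part of the module): a user-supplied
# directory can be given priority over the default locations below; the
# directory name here is hypothetical.
#
#   >>> import nltk.data
#   >>> nltk.data.path.insert(0, '/opt/my_nltk_data')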
# User-specified locations:
_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
path += [d for d in _paths_from_env if d]
if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
    path.append(os.path.expanduser(str('~/nltk_data')))

if sys.platform.startswith('win'):
    # Common locations on Windows:
    path += [
        os.path.join(sys.prefix, str('nltk_data')),
        os.path.join(sys.prefix, str('share'), str('nltk_data')),
        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
        os.path.join(os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data')),
        str(r'C:\nltk_data'),
        str(r'D:\nltk_data'),
        str(r'E:\nltk_data'),
    ]
else:
    # Common locations on UNIX & OS X:
    path += [
        os.path.join(sys.prefix, str('nltk_data')),
        os.path.join(sys.prefix, str('share'), str('nltk_data')),
        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
        str('/usr/share/nltk_data'),
        str('/usr/local/share/nltk_data'),
        str('/usr/lib/nltk_data'),
        str('/usr/local/lib/nltk_data'),
    ]


######################################################################
# Util Functions
######################################################################


def gzip_open_unicode(
    filename,
    mode="rb",
    compresslevel=9,
    encoding='utf-8',
    fileobj=None,
    errors=None,
    newline=None,
):
    if fileobj is None:
        fileobj = GzipFile(filename, mode, compresslevel, fileobj)
    return io.TextIOWrapper(fileobj, encoding, errors, newline)
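# Usage sketch (illustrative; the filename is hypothetical).  The returned
# object is a ``TextIOWrapper``, so reads yield unicode text:
#
#   >>> f = gzip_open_unicode('/tmp/sample.txt.gz')
#   >>> text = f.read()
#   >>> f.close()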
def split_resource_url(resource_url):
    """
    Splits a resource url into its ``(protocol, path)`` components.

    >>> windows = sys.platform.startswith('win')
    >>> split_resource_url('nltk:home/nltk')
    ('nltk', 'home/nltk')
    >>> split_resource_url('nltk:/home/nltk')
    ('nltk', '/home/nltk')
    >>> split_resource_url('file:/home/nltk')
    ('file', '/home/nltk')
    >>> split_resource_url('file:///home/nltk')
    ('file', '/home/nltk')
    >>> split_resource_url('file:///C:/home/nltk')
    ('file', '/C:/home/nltk')
    """
    protocol, path_ = resource_url.split(':', 1)
    if protocol == 'nltk':
        pass
    elif protocol == 'file':
        if path_.startswith('/'):
            path_ = '/' + path_.lstrip('/')
    else:
        path_ = re.sub(r'^/{0,2}', '', path_)
    return protocol, path_


def normalize_resource_url(resource_url):
    r"""
    Normalizes a resource url.

    >>> windows = sys.platform.startswith('win')
    >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \
    ... ('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg'))
    True
    >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file'
    True
    >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file'
    True
    >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg'
    True
    >>> normalize_resource_url('nltk:home/nltk')
    'nltk:home/nltk'
    >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk'
    True
    >>> normalize_resource_url('http://example.com/dir/file')
    'http://example.com/dir/file'
    >>> normalize_resource_url('dir/file')
    'nltk:dir/file'
    """
    try:
        protocol, name = split_resource_url(resource_url)
    except ValueError:
        # The resource url has no protocol; use the nltk protocol by default.
        protocol = 'nltk'
        name = resource_url
    # Use the file protocol if the path is an absolute path.
    if protocol == 'nltk' and os.path.isabs(name):
        protocol = 'file://'
        name = normalize_resource_name(name, False, None)
    elif protocol == 'file':
        protocol = 'file://'
        # name is absolute
        name = normalize_resource_name(name, False, None)
    elif protocol == 'nltk':
        protocol = 'nltk:'
        name = normalize_resource_name(name, True)
    else:
        # Handled by urllib.
        protocol += '://'
    return ''.join([protocol, name])


def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
    """
    :type resource_name: str or unicode
    :param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        ``corpora/brown``.  Directory names will automatically
        be converted to a platform-appropriate path separator.
        Directory trailing slashes are preserved.

    >>> windows = sys.platform.startswith('win')
    >>> normalize_resource_name('.', True)
    './'
    >>> normalize_resource_name('./', True)
    './'
    >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file'
    True
    >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file'
    True
    >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file'
    True
    >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file'
    True
    >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file'
    True
    >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
    True
    """
    is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(
        os.path.sep
    )
    if sys.platform.startswith('win'):
        resource_name = resource_name.lstrip('/')
    else:
        resource_name = re.sub(r'^/+', '/', resource_name)
    if allow_relative:
        resource_name = os.path.normpath(resource_name)
    else:
        if relative_path is None:
            relative_path = os.curdir
        resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
    resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
    if sys.platform.startswith('win') and os.path.isabs(resource_name):
        resource_name = '/' + resource_name
    if is_dir and not resource_name.endswith('/'):
        resource_name += '/'
    return resource_name
######################################################################
# Path Pointers
######################################################################


@add_metaclass(ABCMeta)
class PathPointer(object):
    """
    An abstract base class for 'path pointers', used by NLTK's data
    package to identify specific paths.  Two subclasses exist:
    ``FileSystemPathPointer`` identifies a file that can be accessed
    directly via a given absolute path.  ``ZipFilePathPointer``
    identifies a file contained within a zipfile, that can be accessed
    by reading that zipfile.
    """

    @abstractmethod
    def open(self, encoding=None):
        """
        Return a seekable read-only stream that can be used to read
        the contents of the file identified by this path pointer.

        :raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """

    @abstractmethod
    def file_size(self):
        """
        Return the size of the file pointed to by this path pointer,
        in bytes.

        :raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """

    @abstractmethod
    def join(self, fileid):
        """
        Return a new path pointer formed by starting at the path
        identified by this pointer, and then following the relative
        path given by ``fileid``.  The path components of ``fileid``
        should be separated by forward slashes, regardless of
        the underlying file system's path separator character.
        """


class FileSystemPathPointer(PathPointer, text_type):
    """
    A path pointer that identifies a file which can be accessed
    directly via a given absolute path.
    """

    @py3_data
    def __init__(self, _path):
        """
        Create a new path pointer for the given absolute path.

        :raise IOError: If the given path does not exist.
        """

        _path = os.path.abspath(_path)
        if not os.path.exists(_path):
            raise IOError('No such file or directory: %r' % _path)
        self._path = _path

        # There's no need to call str.__init__(), since it's a no-op;
        # str does all of its setup work in __new__.

    @property
    def path(self):
        """The absolute path identified by this path pointer."""
        return self._path

    def open(self, encoding=None):
        stream = open(self._path, 'rb')
        if encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        return os.stat(self._path).st_size

    def join(self, fileid):
        _path = os.path.join(self._path, fileid)
        return FileSystemPathPointer(_path)

    def __repr__(self):
        # This should be a byte string under Python 2.x;
        # we don't want transliteration here so
        # @python_2_unicode_compatible is not used.
        return str('FileSystemPathPointer(%r)' % self._path)

    def __str__(self):
        return self._path
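# Usage sketch (illustrative; the path below is hypothetical):
#
#   >>> ptr = FileSystemPathPointer('/etc/hosts')
#   >>> nbytes = ptr.file_size()
#   >>> stream = ptr.open(encoding='utf-8')   # a SeekableUnicodeStreamReader
#   >>> first_line = stream.readline()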
329 """ 330 331 _path = os.path.abspath(_path) 332 if not os.path.exists(_path): 333 raise IOError('No such file or directory: %r' % _path) 334 self._path = _path 335 336 # There's no need to call str.__init__(), since it's a no-op; 337 # str does all of its setup work in __new__. 338 339 @property 340 def path(self): 341 """The absolute path identified by this path pointer.""" 342 return self._path 343 344 def open(self, encoding=None): 345 stream = open(self._path, 'rb') 346 if encoding is not None: 347 stream = SeekableUnicodeStreamReader(stream, encoding) 348 return stream 349 350 def file_size(self): 351 return os.stat(self._path).st_size 352 353 def join(self, fileid): 354 _path = os.path.join(self._path, fileid) 355 return FileSystemPathPointer(_path) 356 357 def __repr__(self): 358 # This should be a byte string under Python 2.x; 359 # we don't want transliteration here so 360 # @python_2_unicode_compatible is not used. 361 return str('FileSystemPathPointer(%r)' % self._path) 362 363 def __str__(self): 364 return self._path 365 366 367class BufferedGzipFile(GzipFile): 368 """ 369 A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``. 370 This allows faster reads and writes of data to and from gzip-compressed 371 files at the cost of using more memory. 372 373 The default buffer size is 2MB. 374 375 ``BufferedGzipFile`` is useful for loading large gzipped pickle objects 376 as well as writing large encoded feature files for classifier training. 377 """ 378 379 MB = 2 ** 20 380 SIZE = 2 * MB 381 382 @py3_data 383 def __init__( 384 self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs 385 ): 386 """ 387 Return a buffered gzip file object. 388 389 :param filename: a filesystem path 390 :type filename: str 391 :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab', 392 'w', or 'wb' 393 :type mode: str 394 :param compresslevel: The compresslevel argument is an integer from 1 395 to 9 controlling the level of compression; 1 is fastest and 396 produces the least compression, and 9 is slowest and produces the 397 most compression. The default is 9. 398 :type compresslevel: int 399 :param fileobj: a BytesIO stream to read from instead of a file. 400 :type fileobj: BytesIO 401 :param size: number of bytes to buffer during calls to read() and write() 402 :type size: int 403 :rtype: BufferedGzipFile 404 """ 405 GzipFile.__init__(self, filename, mode, compresslevel, fileobj) 406 self._size = kwargs.get('size', self.SIZE) 407 self._nltk_buffer = BytesIO() 408 # cStringIO does not support len. 409 self._len = 0 410 411 def _reset_buffer(self): 412 # For some reason calling BytesIO.truncate() here will lead to 413 # inconsistent writes so just set _buffer to a new BytesIO object. 414 self._nltk_buffer = BytesIO() 415 self._len = 0 416 417 def _write_buffer(self, data): 418 # Simply write to the buffer and increment the buffer size. 419 if data is not None: 420 self._nltk_buffer.write(data) 421 self._len += len(data) 422 423 def _write_gzip(self, data): 424 # Write the current buffer to the GzipFile. 425 GzipFile.write(self, self._nltk_buffer.getvalue()) 426 # Then reset the buffer and write the new data to the buffer. 427 self._reset_buffer() 428 self._write_buffer(data) 429 430 def close(self): 431 # GzipFile.close() doesn't actuallly close anything. 
class ZipFilePathPointer(PathPointer):
    """
    A path pointer that identifies a file contained within a zipfile,
    which can be accessed by reading that zipfile.
    """

    @py3_data
    def __init__(self, zipfile, entry=''):
        """
        Create a new path pointer pointing at the specified entry
        in the given zipfile.

        :raise IOError: If the given zipfile does not exist, or if it
            does not contain the specified entry.
        """
        if isinstance(zipfile, string_types):
            zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))

        # Check that the entry exists:
        if entry:

            # Normalize the entry string; it should be relative:
            entry = normalize_resource_name(entry, True, '/').lstrip('/')

            try:
                zipfile.getinfo(entry)
            except Exception:
                # Sometimes directories aren't explicitly listed in
                # the zip file.  So if `entry` is a directory name,
                # then check if the zipfile contains any files that
                # are under the given directory.
                if entry.endswith('/') and [
                    n for n in zipfile.namelist() if n.startswith(entry)
                ]:
                    pass  # The zipfile contains a file in that directory.
                else:
                    # Otherwise, complain.
                    raise IOError(
                        'Zipfile %r does not contain %r' % (zipfile.filename, entry)
                    )
        self._zipfile = zipfile
        self._entry = entry

    @property
    def zipfile(self):
        """
        The zipfile.ZipFile object used to access the zip file
        containing the entry identified by this path pointer.
        """
        return self._zipfile

    @property
    def entry(self):
        """
        The name of the file within zipfile that this path
        pointer points to.
        """
        return self._entry

    def open(self, encoding=None):
        data = self._zipfile.read(self._entry)
        stream = BytesIO(data)
        if self._entry.endswith('.gz'):
            # Note: In Python >= 3.5, GzipFile already uses a buffered reader
            # in the backend, which has a variable self._buffer.
            # See https://github.com/nltk/nltk/issues/1308
            if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
                stream = BufferedGzipFile(self._entry, fileobj=stream)
            else:
                stream = GzipFile(self._entry, fileobj=stream)
        elif encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        return self._zipfile.getinfo(self._entry).file_size

    def join(self, fileid):
        entry = '%s/%s' % (self._entry, fileid)
        return ZipFilePathPointer(self._zipfile, entry)

    def __repr__(self):
        return str('ZipFilePathPointer(%r, %r)') % (self._zipfile.filename, self._entry)

    def __str__(self):
        return os.path.normpath(os.path.join(self._zipfile.filename, self._entry))
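# Usage sketch (illustrative; the zipfile location is hypothetical):
#
#   >>> ptr = ZipFilePathPointer('/opt/nltk_data/corpora/abc.zip', 'abc/rural.txt')
#   >>> data = ptr.open().read()   # raw bytes from inside the zipfile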
546 """ 547 return self._entry 548 549 def open(self, encoding=None): 550 data = self._zipfile.read(self._entry) 551 stream = BytesIO(data) 552 if self._entry.endswith('.gz'): 553 # Note: In >= Python3.5, GzipFile is already using a 554 # buffered reader in the backend which has a variable self._buffer 555 # See https://github.com/nltk/nltk/issues/1308 556 if sys.version.startswith('2.7') or sys.version.startswith('3.4'): 557 stream = BufferedGzipFile(self._entry, fileobj=stream) 558 else: 559 stream = GzipFile(self._entry, fileobj=stream) 560 elif encoding is not None: 561 stream = SeekableUnicodeStreamReader(stream, encoding) 562 return stream 563 564 def file_size(self): 565 return self._zipfile.getinfo(self._entry).file_size 566 567 def join(self, fileid): 568 entry = '%s/%s' % (self._entry, fileid) 569 return ZipFilePathPointer(self._zipfile, entry) 570 571 def __repr__(self): 572 return str('ZipFilePathPointer(%r, %r)') % (self._zipfile.filename, self._entry) 573 574 def __str__(self): 575 return os.path.normpath(os.path.join(self._zipfile.filename, self._entry)) 576 577 578###################################################################### 579# Access Functions 580###################################################################### 581 582# Don't use a weak dictionary, because in the common case this 583# causes a lot more reloading that necessary. 584_resource_cache = {} 585"""A dictionary used to cache resources so that they won't 586 need to be loaded more than once.""" 587 588 589def find(resource_name, paths=None): 590 """ 591 Find the given resource by searching through the directories and 592 zip files in paths, where a None or empty string specifies an absolute path. 593 Returns a corresponding path name. If the given resource is not 594 found, raise a ``LookupError``, whose message gives a pointer to 595 the installation instructions for the NLTK downloader. 596 597 Zip File Handling: 598 599 - If ``resource_name`` contains a component with a ``.zip`` 600 extension, then it is assumed to be a zipfile; and the 601 remaining path components are used to look inside the zipfile. 602 603 - If any element of ``nltk.data.path`` has a ``.zip`` extension, 604 then it is assumed to be a zipfile. 605 606 - If a given resource name that does not contain any zipfile 607 component is not found initially, then ``find()`` will make a 608 second attempt to find that resource, by replacing each 609 component *p* in the path with *p.zip/p*. For example, this 610 allows ``find()`` to map the resource name 611 ``corpora/chat80/cities.pl`` to a zip file path pointer to 612 ``corpora/chat80.zip/chat80/cities.pl``. 613 614 - When using ``find()`` to locate a directory contained in a 615 zipfile, the resource name must end with the forward slash 616 character. Otherwise, ``find()`` will not locate the 617 directory. 618 619 :type resource_name: str or unicode 620 :param resource_name: The name of the resource to search for. 621 Resource names are posix-style relative path names, such as 622 ``corpora/brown``. Directory names will be 623 automatically converted to a platform-appropriate path separator. 
def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named ``filename``, then raise a ``ValueError``.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    """
    resource_url = normalize_resource_url(resource_url)
    if filename is None:
        if resource_url.startswith('file:'):
            filename = os.path.split(resource_url)[-1]
        else:
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError("File %r already exists!" % filename)

    if verbose:
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the input & output streams.
    infile = _open(resource_url)

    # Copy infile -> outfile, using 64k blocks.
    with open(filename, "wb") as outfile:
        while True:
            s = infile.read(1024 * 64)  # 64k blocks.
            outfile.write(s)
            if not s:
                break

    infile.close()
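# Usage sketch (illustrative; assumes the sample grammars have been
# downloaded).  The copy is written to 'toy.cfg' in the current directory:
#
#   >>> retrieve('nltk:grammars/sample_grammars/toy.cfg')
#   Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg'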
#: A dictionary describing the formats that are supported by NLTK's
#: load() method.  Keys are format names, and values are format
#: descriptions.
FORMATS = {
    'pickle': "A serialized python object, stored using the pickle module.",
    'json': "A serialized python object, stored using the json module.",
    'yaml': "A serialized python object, stored using the yaml module.",
    'cfg': "A context free grammar.",
    'pcfg': "A probabilistic CFG.",
    'fcfg': "A feature CFG.",
    'fol': "A list of first order logic expressions, parsed with "
    "nltk.sem.logic.Expression.fromstring.",
    'logic': "A list of first order logic expressions, parsed with "
    "nltk.sem.logic.LogicParser.  Requires an additional logic_parser "
    "parameter.",
    'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
    'raw': "The raw (byte string) contents of a file.",
    'text': "The raw (unicode string) contents of a file.",
}

#: A dictionary mapping from file extensions to format names, used
#: by load() when format="auto" to decide the format for a
#: given resource url.
AUTO_FORMATS = {
    'pickle': 'pickle',
    'json': 'json',
    'yaml': 'yaml',
    'cfg': 'cfg',
    'pcfg': 'pcfg',
    'fcfg': 'fcfg',
    'fol': 'fol',
    'logic': 'logic',
    'val': 'val',
    'txt': 'text',
    'text': 'text',
}
def load(
    resource_url,
    format='auto',
    cache=True,
    verbose=False,
    logic_parser=None,
    fstruct_reader=None,
    encoding=None,
):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:

      - ``pickle``
      - ``json``
      - ``yaml``
      - ``cfg`` (context free grammars)
      - ``pcfg`` (probabilistic CFGs)
      - ``fcfg`` (feature-based CFGs)
      - ``fol`` (formulas of First Order Logic)
      - ``logic`` (Logical formulas to be parsed by the given logic_parser)
      - ``val`` (valuation of First Order Logic model)
      - ``text`` (the file contents as a unicode string)
      - ``raw`` (the raw file contents as a byte string)

    If no format is specified, ``load()`` will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, ``load()`` will raise a ``ValueError`` exception.

    For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``),
    it tries to decode the raw contents using UTF-8, and if that doesn't
    work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding``
    is specified.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    :type cache: bool
    :param cache: If true, add this resource to a cache.  If load()
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache is a plain in-memory
        dictionary; use ``clear_cache()`` to remove resources from it.
    :type verbose: bool
    :param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    :type logic_parser: LogicParser
    :param logic_parser: The parser that will be used to parse logical
        expressions.
    :type fstruct_reader: FeatStructReader
    :param fstruct_reader: The parser that will be used to parse the
        feature structure of an fcfg.
    :type encoding: str
    :param encoding: the encoding of the input; only used for text formats.
    """
    resource_url = normalize_resource_url(resource_url)
    resource_url = add_py3_data(resource_url)

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        if ext == 'gz':
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError(
                'Could not determine format for %s based '
                'on its file\nextension; use the "format" '
                'argument to specify the format explicitly.' % resource_url
            )

    if format not in FORMATS:
        raise ValueError('Unknown format type: %s!' % (format,))

    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get((resource_url, format))
        if resource_val is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Load the resource.
    opened_resource = _open(resource_url)

    if format == 'raw':
        resource_val = opened_resource.read()
    elif format == 'pickle':
        resource_val = pickle.load(opened_resource)
    elif format == 'json':
        import json
        from nltk.jsontags import json_tags

        resource_val = json.load(opened_resource)
        tag = None
        if len(resource_val) != 1:
            tag = next(iter(resource_val.keys()))
        if tag not in json_tags:
            raise ValueError('Unknown json tag.')
    elif format == 'yaml':
        import yaml

        resource_val = yaml.load(opened_resource)
    else:
        # The resource is a text format.
        binary_data = opened_resource.read()
        if encoding is not None:
            string_data = binary_data.decode(encoding)
        else:
            try:
                string_data = binary_data.decode('utf-8')
            except UnicodeDecodeError:
                string_data = binary_data.decode('latin-1')
        if format == 'text':
            resource_val = string_data
        elif format == 'cfg':
            resource_val = nltk.grammar.CFG.fromstring(string_data, encoding=encoding)
        elif format == 'pcfg':
            resource_val = nltk.grammar.PCFG.fromstring(string_data, encoding=encoding)
        elif format == 'fcfg':
            resource_val = nltk.grammar.FeatureGrammar.fromstring(
                string_data,
                logic_parser=logic_parser,
                fstruct_reader=fstruct_reader,
                encoding=encoding,
            )
        elif format == 'fol':
            resource_val = nltk.sem.read_logic(
                string_data,
                logic_parser=nltk.sem.logic.LogicParser(),
                encoding=encoding,
            )
        elif format == 'logic':
            resource_val = nltk.sem.read_logic(
                string_data, logic_parser=logic_parser, encoding=encoding
            )
        elif format == 'val':
            resource_val = nltk.sem.read_valuation(string_data, encoding=encoding)
        else:
            raise AssertionError(
                "Internal NLTK error: Format %s isn't "
                "handled by nltk.data.load()" % (format,)
            )

    opened_resource.close()

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[(resource_url, format)] = resource_val
            # TODO: add this line
            # print('<<Caching a copy of %s>>' % (resource_url,))
        except TypeError:
            # If the resource can't be cached for some reason (historically,
            # when a weak-value dictionary was used, some object types could
            # not be weakly referenced), just don't cache it.
            pass

    return resource_val
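# Usage sketch (illustrative; assumes the sample grammars have been
# downloaded).  The format is inferred from the extension unless given
# explicitly:
#
#   >>> grammar = load('grammars/sample_grammars/toy.cfg')           # nltk.grammar.CFG
#   >>> raw = load('grammars/sample_grammars/toy.cfg', format='text')
#   >>> clear_cache()   # drop all cached resources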
def show_cfg(resource_url, escape='##'):
    """
    Write out a grammar file, ignoring escaped and empty lines.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    :type escape: str
    :param escape: Prepended string that signals lines to be ignored
    """
    resource_url = normalize_resource_url(resource_url)
    resource_val = load(resource_url, format='text', cache=False)
    lines = resource_val.splitlines()
    for l in lines:
        if l.startswith(escape):
            continue
        if re.match('^$', l):
            continue
        print(l)


def clear_cache():
    """
    Remove all objects from the resource cache.
    :see: load()
    """
    _resource_cache.clear()


def _open(resource_url):
    """
    Helper function that returns an open file object for a resource,
    given its resource URL.  If the given resource URL uses the "nltk:"
    protocol, or uses no protocol, then use ``nltk.data.find`` to find
    its path, and open it with the given mode; if the resource URL
    uses the 'file' protocol, then open the file with the given mode;
    otherwise, delegate to ``urlopen()``.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    """
    resource_url = normalize_resource_url(resource_url)
    protocol, path_ = split_resource_url(resource_url)

    if protocol is None or protocol.lower() == 'nltk':
        return find(path_, path + ['']).open()
    elif protocol.lower() == 'file':
        # urllib might not use mode='rb', so handle this one ourselves:
        return find(path_, ['']).open()
    else:
        return urlopen(resource_url)


######################################################################
# Lazy Resource Loader
######################################################################

# We shouldn't apply the @python_2_unicode_compatible decorator to
# LazyLoader; that is the responsibility of resource.__class__.


class LazyLoader(object):
    @py3_data
    def __init__(self, _path):
        self._path = _path

    def __load(self):
        resource = load(self._path)
        # This is where the magic happens!  Transform ourselves into
        # the object by modifying our own __dict__ and __class__ to
        # match that of `resource`.
        self.__dict__ = resource.__dict__
        self.__class__ = resource.__class__

    def __getattr__(self, attr):
        self.__load()
        # This looks circular, but it's not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        self.__load()
        # This looks circular, but it's not, since __load() changes our
        # __class__ to something new:
        return repr(self)
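# Usage sketch (illustrative; the pickle name below assumes the
# averaged_perceptron_tagger package has been downloaded):
#
#   >>> tagger = LazyLoader('taggers/averaged_perceptron_tagger/'
#   ...                     'averaged_perceptron_tagger.pickle')
#   >>> # No I/O has happened yet; the first attribute access calls load()
#   >>> # and transmutes `tagger` into the unpickled object:
#   >>> tagger.tag(['Hello', 'world'])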
######################################################################
# Open-On-Demand ZipFile
######################################################################


class OpenOnDemandZipFile(zipfile.ZipFile):
    """
    A subclass of ``zipfile.ZipFile`` that closes its file pointer
    whenever it is not using it; and re-opens it when it needs to read
    data from the zipfile.  This is useful for reducing the number of
    open file handles when many zip files are being accessed at once.
    ``OpenOnDemandZipFile`` must be constructed from a filename, not a
    file-like object (to allow re-opening).  ``OpenOnDemandZipFile`` is
    read-only (i.e. ``write()`` and ``writestr()`` are disabled).
    """

    @py3_data
    def __init__(self, filename):
        if not isinstance(filename, string_types):
            raise TypeError('OpenOnDemandZipFile filename must be a string')
        zipfile.ZipFile.__init__(self, filename)
        assert self.filename == filename
        self.close()
        # After closing a ZipFile object, _fileRefCnt needs to be cleared
        # for Python 2 and 3 compatible code.
        self._fileRefCnt = 0

    def read(self, name):
        assert self.fp is None
        self.fp = open(self.filename, 'rb')
        value = zipfile.ZipFile.read(self, name)
        # _fileRefCnt needs to be adjusted for Python 2 and 3 compatible
        # code.  Since we only opened one file here, we add 1.
        self._fileRefCnt += 1
        self.close()
        return value

    def write(self, *args, **kwargs):
        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def writestr(self, *args, **kwargs):
        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def __repr__(self):
        return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
######################################################################
# Seekable Unicode Stream Reader
######################################################################


class SeekableUnicodeStreamReader(object):
    """
    A stream reader that automatically encodes the source byte stream
    into unicode (like ``codecs.StreamReader``); but still supports the
    ``seek()`` and ``tell()`` operations correctly.  This is in contrast
    to ``codecs.StreamReader``, which provides *broken* ``seek()`` and
    ``tell()`` methods.

    This class was motivated by ``StreamBackedCorpusView``, which
    makes extensive use of ``seek()`` and ``tell()``, and needs to be
    able to handle unicode-encoded files.

    Note: this class requires stateless decoders.  To my knowledge,
    this shouldn't cause a problem with any of python's builtin
    unicode encodings.
    """

    DEBUG = True  #: If true, then perform extra sanity checks.

    @py3_data
    def __init__(self, stream, encoding, errors='strict'):
        # Rewind the stream to its beginning.
        stream.seek(0)

        self.stream = stream
        """The underlying stream."""

        self.encoding = encoding
        """The name of the encoding that should be used to encode the
           underlying stream."""

        self.errors = errors
        """The error mode that should be used when decoding data from
           the underlying stream.  Can be 'strict', 'ignore', or
           'replace'."""

        self.decode = codecs.getdecoder(encoding)
        """The function that is used to decode byte strings into
           unicode strings."""

        self.bytebuffer = b''
        """A buffer for bytes that have been read but have not yet
           been decoded.  This is only used when the final bytes from
           a read do not form a complete encoding for a character."""

        self.linebuffer = None
        """A buffer used by ``readline()`` to hold characters that have
           been read, but have not yet been returned by ``read()`` or
           ``readline()``.  This buffer consists of a list of unicode
           strings, where each string corresponds to a single line.
           The final element of the list may or may not be a complete
           line.  Note that the existence of a linebuffer makes the
           ``tell()`` operation more complex, because it must backtrack
           to the beginning of the buffer to determine the correct
           file position in the underlying byte stream."""

        self._rewind_checkpoint = 0
        """The file position at which the most recent read on the
           underlying stream began.  This is used, together with
           ``_rewind_numchars``, to backtrack to the beginning of
           ``linebuffer`` (which is required by ``tell()``)."""

        self._rewind_numchars = None
        """The number of characters that have been returned since the
           read that started at ``_rewind_checkpoint``.  This is used,
           together with ``_rewind_checkpoint``, to backtrack to the
           beginning of ``linebuffer`` (which is required by ``tell()``)."""

        self._bom = self._check_bom()
        """The length of the byte order marker at the beginning of
           the stream (or None for no byte order marker)."""

    # /////////////////////////////////////////////////////////////////
    # Read methods
    # /////////////////////////////////////////////////////////////////

    def read(self, size=None):
        """
        Read up to ``size`` bytes, decode them using this reader's
        encoding, and return the resulting unicode string.

        :param size: The maximum number of bytes to read.  If not
            specified, then read as many bytes as possible.
        :type size: int
        :rtype: unicode
        """
        chars = self._read(size)

        # If linebuffer is not empty, then include it in the result.
        if self.linebuffer:
            chars = ''.join(self.linebuffer) + chars
            self.linebuffer = None
            self._rewind_numchars = None

        return chars

    def discard_line(self):
        if self.linebuffer and len(self.linebuffer) > 1:
            line = self.linebuffer.pop(0)
            self._rewind_numchars += len(line)
        else:
            self.stream.readline()
    def readline(self, size=None):
        """
        Read a line of text, decode it using this reader's encoding,
        and return the resulting unicode string.

        :param size: The maximum number of bytes to read.  If no
            newline is encountered before ``size`` bytes have been read,
            then the returned value may not be a complete line of text.
        :type size: int
        """
        # If we have a non-empty linebuffer, then return the first
        # line from it.  (Note that the last element of linebuffer may
        # not be a complete line; so let _read() deal with it.)
        if self.linebuffer and len(self.linebuffer) > 1:
            line = self.linebuffer.pop(0)
            self._rewind_numchars += len(line)
            return line

        readsize = size or 72
        chars = ''

        # If there's a remaining incomplete line in the buffer, add it.
        if self.linebuffer:
            chars += self.linebuffer.pop()
            self.linebuffer = None

        while True:
            startpos = self.stream.tell() - len(self.bytebuffer)
            new_chars = self._read(readsize)

            # If we're at a '\r', then read one extra character, since
            # it might be a '\n', to get the proper line ending.
            if new_chars and new_chars.endswith('\r'):
                new_chars += self._read(1)

            chars += new_chars
            lines = chars.splitlines(True)
            if len(lines) > 1:
                line = lines[0]
                self.linebuffer = lines[1:]
                self._rewind_numchars = len(new_chars) - (len(chars) - len(line))
                self._rewind_checkpoint = startpos
                break
            elif len(lines) == 1:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend:  # complete line
                    line = line0withend
                    break

            if not new_chars or size is not None:
                line = chars
                break

            # Read successively larger blocks of text.
            if readsize < 8000:
                readsize *= 2

        return line

    def readlines(self, sizehint=None, keepends=True):
        """
        Read this file's contents, decode them using this reader's
        encoding, and return it as a list of unicode lines.

        :rtype: list(unicode)
        :param sizehint: Ignored.
        :param keepends: If false, then strip newlines.
        """
        return self.read().splitlines(keepends)

    def next(self):
        """Return the next decoded line from the underlying stream."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration

    def __next__(self):
        return self.next()

    def __iter__(self):
        """Return self"""
        return self

    def __del__(self):
        # Let the garbage collector deal with still-opened streams.
        if not self.closed:
            self.close()

    def xreadlines(self):
        """Return self"""
        return self

    # /////////////////////////////////////////////////////////////////
    # Pass-through methods & properties
    # /////////////////////////////////////////////////////////////////

    @property
    def closed(self):
        """True if the underlying stream is closed."""
        return self.stream.closed

    @property
    def name(self):
        """The name of the underlying stream."""
        return self.stream.name

    @property
    def mode(self):
        """The mode of the underlying stream."""
        return self.stream.mode

    def close(self):
        """
        Close the underlying stream.
        """
        self.stream.close()
    # /////////////////////////////////////////////////////////////////
    # Seek and tell
    # /////////////////////////////////////////////////////////////////

    def seek(self, offset, whence=0):
        """
        Move the stream to a new file position.  If the reader is
        maintaining any buffers, then they will be cleared.

        :param offset: A byte count offset.
        :param whence: If 0, then the offset is from the start of the file
            (offset should be positive), if 1, then the offset is from the
            current position (offset may be positive or negative); and if 2,
            then the offset is from the end of the file (offset should
            typically be negative).
        """
        if whence == 1:
            raise ValueError(
                'Relative seek is not supported for '
                'SeekableUnicodeStreamReader -- consider '
                'using char_seek_forward() instead.'
            )
        self.stream.seek(offset, whence)
        self.linebuffer = None
        self.bytebuffer = b''
        self._rewind_numchars = None
        self._rewind_checkpoint = self.stream.tell()

    def char_seek_forward(self, offset):
        """
        Move the read pointer forward by ``offset`` characters.
        """
        if offset < 0:
            raise ValueError('Negative offsets are not supported')
        # Clear all buffers.
        self.seek(self.tell())
        # Perform the seek operation.
        self._char_seek_forward(offset)

    def _char_seek_forward(self, offset, est_bytes=None):
        """
        Move the file position forward by ``offset`` characters,
        ignoring all buffers.

        :param est_bytes: A hint, giving an estimate of the number of
            bytes that will be needed to move forward by ``offset`` chars.
            Defaults to ``offset``.
        """
        if est_bytes is None:
            est_bytes = offset
        bytes = b''

        while True:
            # Read in a block of bytes.
            newbytes = self.stream.read(est_bytes - len(bytes))
            bytes += newbytes

            # Decode the bytes to characters.
            chars, bytes_decoded = self._incr_decode(bytes)

            # If we got the right number of characters, then seek
            # backwards over any truncated characters, and return.
            if len(chars) == offset:
                self.stream.seek(-len(bytes) + bytes_decoded, 1)
                return

            # If we went too far, then we can back up until we get it
            # right, using the bytes we've already read.
            if len(chars) > offset:
                while len(chars) > offset:
                    # Assume at least one byte/char.
                    est_bytes += offset - len(chars)
                    chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
                self.stream.seek(-len(bytes) + bytes_decoded, 1)
                return

            # Otherwise, we haven't read enough bytes yet; loop again.
            est_bytes += offset - len(chars)
    def tell(self):
        """
        Return the current file position on the underlying byte
        stream.  If this reader is maintaining any buffers, then the
        returned file position will be the position of the beginning
        of those buffers.
        """
        # If nothing's buffered, then just return our current filepos:
        if self.linebuffer is None:
            return self.stream.tell() - len(self.bytebuffer)

        # Otherwise, we'll need to backtrack the filepos until we
        # reach the beginning of the buffer.

        # Store our original file position, so we can return here.
        orig_filepos = self.stream.tell()

        # Calculate an estimate of where we think the newline is.
        bytes_read = (orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint
        buf_size = sum(len(line) for line in self.linebuffer)
        est_bytes = int(
            (bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size))
        )

        self.stream.seek(self._rewind_checkpoint)
        self._char_seek_forward(self._rewind_numchars, est_bytes)
        filepos = self.stream.tell()

        # Sanity check:
        if self.DEBUG:
            self.stream.seek(filepos)
            check1 = self._incr_decode(self.stream.read(50))[0]
            check2 = ''.join(self.linebuffer)
            assert check1.startswith(check2) or check2.startswith(check1)

        # Return to our original filepos (so we don't have to throw
        # out our buffer.)
        self.stream.seek(orig_filepos)

        # Return the calculated filepos.
        return filepos

    # /////////////////////////////////////////////////////////////////
    # Helper methods
    # /////////////////////////////////////////////////////////////////

    def _read(self, size=None):
        """
        Read up to ``size`` bytes from the underlying stream, decode
        them using this reader's encoding, and return the resulting
        unicode string.  ``linebuffer`` is not included in the result.
        """
        if size == 0:
            return ''

        # Skip past the byte order marker, if present.
        if self._bom and self.stream.tell() == 0:
            self.stream.read(self._bom)

        # Read the requested number of bytes.
        if size is None:
            new_bytes = self.stream.read()
        else:
            new_bytes = self.stream.read(size)
        bytes = self.bytebuffer + new_bytes

        # Decode the bytes into unicode characters.
        chars, bytes_decoded = self._incr_decode(bytes)

        # If we got bytes but couldn't decode any, then read further.
        if (size is not None) and (not chars) and (len(new_bytes) > 0):
            while not chars:
                new_bytes = self.stream.read(1)
                if not new_bytes:
                    break  # end of file.
                bytes += new_bytes
                chars, bytes_decoded = self._incr_decode(bytes)

        # Record any bytes we didn't consume.
        self.bytebuffer = bytes[bytes_decoded:]

        # Return the result.
        return chars

    def _incr_decode(self, bytes):
        """
        Decode the given byte string into a unicode string, using this
        reader's encoding.  If an exception is encountered that
        appears to be caused by a truncation error, then just decode
        the byte string without the bytes that cause the truncation
        error.

        Return a tuple ``(chars, num_consumed)``, where ``chars`` is
        the decoded unicode string, and ``num_consumed`` is the
        number of bytes that were consumed.
        """
        while True:
            try:
                return self.decode(bytes, 'strict')
            except UnicodeDecodeError as exc:
                # If the exception occurs at the end of the string,
                # then assume that it's a truncation error.
                if exc.end == len(bytes):
                    return self.decode(bytes[: exc.start], self.errors)

                # Otherwise, if we're being strict, then raise it.
                elif self.errors == 'strict':
                    raise

                # If we're not strict, then re-process it with our
                # errors setting.  This *may* raise an exception.
                else:
                    return self.decode(bytes, self.errors)

    _BOM_TABLE = {
        'utf8': [(codecs.BOM_UTF8, None)],
        'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'), (codecs.BOM_UTF16_BE, 'utf16-be')],
        'utf16le': [(codecs.BOM_UTF16_LE, None)],
        'utf16be': [(codecs.BOM_UTF16_BE, None)],
        'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'), (codecs.BOM_UTF32_BE, 'utf32-be')],
        'utf32le': [(codecs.BOM_UTF32_LE, None)],
        'utf32be': [(codecs.BOM_UTF32_BE, None)],
    }

    def _check_bom(self):
        # Normalize our encoding name.
        enc = re.sub('[ -]', '', self.encoding.lower())

        # Look up our encoding in the BOM table.
        bom_info = self._BOM_TABLE.get(enc)

        if bom_info:
            # Read a prefix, to check against the BOM(s).
            bytes = self.stream.read(16)
            self.stream.seek(0)

            # Check for each possible BOM.
            for (bom, new_encoding) in bom_info:
                if bytes.startswith(bom):
                    if new_encoding:
                        self.encoding = new_encoding
                    return len(bom)

        return None
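# Usage sketch (illustrative): unlike ``codecs.StreamReader``, this reader's
# tell() returns byte offsets that can safely be passed back to seek(), even
# while readline() is holding buffered lines:
#
#   >>> raw = io.BytesIO(u'caf\xe9\nbar\n'.encode('utf-8'))
#   >>> reader = SeekableUnicodeStreamReader(raw, 'utf-8')
#   >>> reader.readline()          # u'caf\xe9\n'
#   >>> pos = reader.tell()
#   >>> reader.readline()          # u'bar\n'
#   >>> reader.seek(pos)
#   >>> reader.readline()          # u'bar\n' again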
__all__ = [
    'path',
    'PathPointer',
    'FileSystemPathPointer',
    'BufferedGzipFile',
    'GzipFileSystemPathPointer',
    'find',
    'retrieve',
    'FORMATS',
    'AUTO_FORMATS',
    'load',
    'show_cfg',
    'clear_cache',
    'LazyLoader',
    'OpenOnDemandZipFile',
    'SeekableUnicodeStreamReader',
]