# Natural Language Toolkit: Corpus Reader Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import os
import bisect
import re
import tempfile
from functools import reduce

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:  # Use the c version of ElementTree, which is faster, if possible.
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

from six import string_types, text_type

from nltk.tokenize import wordpunct_tokenize
from nltk.internals import slice_bounds
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.data import SeekableUnicodeStreamReader
from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation, py25

######################################################################
# { Corpus View
######################################################################


class StreamBackedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file, which acts like a sequence of tokens:
    it can be accessed by index, iterated over, etc. However, the
    tokens are only constructed as-needed -- the entire corpus is
    never stored in memory at once.

    The constructor to ``StreamBackedCorpusView`` takes two arguments:
    a corpus fileid (specified as a string or as a ``PathPointer``);
    and a block reader. A "block reader" is a function that reads
    zero or more tokens from a stream, and returns them as a list. A
    very simple example of a block reader is:

        >>> def simple_block_reader(stream):
        ...     return stream.readline().split()

    This simple block reader reads a single line at a time, and
    returns a single token (consisting of a string) for each
    whitespace-separated substring on the line.

    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader. Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block). On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus. (But note that larger block sizes will *not*
    decrease performance for iteration.)

    Internally, ``CorpusView`` maintains a partial mapping from token
    index to file position, with one entry per block. When a token
    with a given index *i* is requested, the ``CorpusView`` constructs
    it as follows:

    1. First, it searches the toknum/filepos mapping for the token
       index closest to (but less than or equal to) *i*.

    2. Then, starting at the file position corresponding to that
       index, it reads one block at a time using the block reader
       until it reaches the requested token.

    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping. (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    :note: Each ``CorpusView`` object internally maintains an open file
        object for its underlying corpus file.
        This file should be
        automatically closed when the ``CorpusView`` is garbage collected,
        but if you wish to close it manually, use the ``close()``
        method. If you access a ``CorpusView``'s items after it has been
        closed, the file object will be automatically re-opened.

    :warning: If the contents of the file are modified during the
        lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
        is undefined.

    :warning: If a unicode encoding is specified when constructing a
        ``CorpusView``, then the block reader may only call
        ``stream.seek()`` with offsets that have been returned by
        ``stream.tell()``; in particular, calling ``stream.seek()`` with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    :ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    :ivar _toknum: A list containing the token index of each block
        that has been processed. In particular, ``_toknum[i]`` is the
        token index of the first token in block ``i``. Together
        with ``_filepos``, this forms a partial mapping between token
        indices and file positions.
    :ivar _filepos: A list containing the file position of each block
        that has been processed. In particular, ``_filepos[i]`` is the
        file position of the first character in block ``i``. Together
        with ``_toknum``, this forms a partial mapping between token
        indices and file positions.
    :ivar _stream: The stream used to access the underlying corpus file.
    :ivar _len: The total number of tokens in the corpus, if known;
        or None, if the number of tokens is not yet known.
    :ivar _eofpos: The character position of the last character in the
        file. This is calculated when the corpus view is initialized,
        and is used to decide when the end of file has been reached.
    :ivar _cache: A cache of the most recently read block. It
        is encoded as a tuple (start_toknum, end_toknum, tokens), where
        start_toknum is the token index of the first token in the block;
        end_toknum is the token index of the first token not in the
        block; and tokens is a list of the tokens in the block.
    """

    def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'):
        """
        Create a new corpus view, based on the file ``fileid``, and
        read with ``block_reader``. See the class documentation
        for more information.

        :param fileid: The path to the file that is read by this
            corpus view. ``fileid`` can either be a string or a
            ``PathPointer``.

        :param startpos: The file position at which the view will
            start reading. This can be used to skip over preface
            sections.

        :param encoding: The unicode encoding that should be used to
            read the file's contents. If no encoding is specified,
            then the file's contents will be read as a non-unicode
            string (i.e., a str).
        """
        if block_reader:
            self.read_block = block_reader
        # Initialize our toknum/filepos mapping.
        self._toknum = [0]
        self._filepos = [startpos]
        self._encoding = encoding
        # We don't know our length (number of tokens) yet.
        self._len = None

        self._fileid = fileid
        self._stream = None

        self._current_toknum = None
        """This variable is set to the index of the next token that
        will be read, immediately before ``self.read_block()`` is
        called.
        This is provided for the benefit of the block
        reader, which under rare circumstances may need to know
        the current token number."""

        self._current_blocknum = None
        """This variable is set to the index of the next block that
        will be read, immediately before ``self.read_block()`` is
        called. This is provided for the benefit of the block
        reader, which under rare circumstances may need to know
        the current block number."""

        # Find the length of the file.
        try:
            if isinstance(self._fileid, PathPointer):
                self._eofpos = self._fileid.file_size()
            else:
                self._eofpos = os.stat(self._fileid).st_size
        except Exception as exc:
            raise ValueError('Unable to open or access %r -- %s' % (fileid, exc))

        # Maintain a cache of the most recently read block, to
        # increase efficiency of random access.
        self._cache = (-1, -1, None)

    fileid = property(
        lambda self: self._fileid,
        doc="""
        The fileid of the file that is accessed by this view.

        :type: str or PathPointer""",
    )

    def read_block(self, stream):
        """
        Read a block from the input stream.

        :return: a block of tokens from the input stream
        :rtype: list(any)
        :param stream: an input stream
        :type stream: stream
        """
        raise NotImplementedError('Abstract Method')

    def _open(self):
        """
        Open the file stream associated with this corpus view. This
        will be performed if any value is read from the view
        while its file stream is closed.
        """
        if isinstance(self._fileid, PathPointer):
            self._stream = self._fileid.open(self._encoding)
        elif self._encoding:
            self._stream = SeekableUnicodeStreamReader(
                open(self._fileid, 'rb'), self._encoding
            )
        else:
            self._stream = open(self._fileid, 'rb')

    def close(self):
        """
        Close the file stream associated with this corpus view. This
        can be useful if you are worried about running out of file
        handles (although the stream should automatically be closed
        upon garbage collection of the corpus view). If the corpus
        view is accessed after it is closed, it will be automatically
        re-opened.
        """
        if self._stream is not None:
            self._stream.close()
        self._stream = None

    def __len__(self):
        if self._len is None:
            # iterate_from() sets self._len when it reaches the end
            # of the file:
            for tok in self.iterate_from(self._toknum[-1]):
                pass
        return self._len

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop = slice_bounds(self, i)
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= start and stop <= self._cache[1]:
                return self._cache[2][start - offset : stop - offset]
            # Construct & return the result.
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0:
                i += len(self)
            if i < 0:
                raise IndexError('index out of range')
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= i < self._cache[1]:
                return self._cache[2][i - offset]
            # Use iterate_from to extract it.
            try:
                return next(self.iterate_from(i))
            except StopIteration:
                raise IndexError('index out of range')

    # If we wanted to be thread-safe, then this method would need to
    # do some locking.
    def iterate_from(self, start_tok):
        # Start by feeding from the cache, if possible.
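        # (self._cache is a (start_toknum, end_toknum, tokens) triple;
        # when start_tok falls inside that range we can yield the cached
        # tokens directly, without touching the file.)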
        if self._cache[0] <= start_tok < self._cache[1]:
            for tok in self._cache[2][start_tok - self._cache[0] :]:
                yield tok
                start_tok += 1

        # Decide where in the file we should start. If `start` is in
        # our mapping, then we can jump straight to the correct block;
        # otherwise, start at the last block we've processed.
        if start_tok < self._toknum[-1]:
            block_index = bisect.bisect_right(self._toknum, start_tok) - 1
            toknum = self._toknum[block_index]
            filepos = self._filepos[block_index]
        else:
            block_index = len(self._toknum) - 1
            toknum = self._toknum[-1]
            filepos = self._filepos[-1]

        # Open the stream, if it's not open already.
        if self._stream is None:
            self._open()

        # If the file is empty, the while loop will never run.
        # This *seems* to be all the state we need to set:
        if self._eofpos == 0:
            self._len = 0

        # Each iteration through this loop, we read a single block
        # from the stream.
        while filepos < self._eofpos:
            # Read the next block.
            self._stream.seek(filepos)
            self._current_toknum = toknum
            self._current_blocknum = block_index
            tokens = self.read_block(self._stream)
            assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
                'block reader %s() should return list or tuple.'
                % self.read_block.__name__
            )
            num_toks = len(tokens)
            new_filepos = self._stream.tell()
            assert new_filepos > filepos, (
                'block reader %s() should consume at least 1 byte (filepos=%d)'
                % (self.read_block.__name__, filepos)
            )

            # Update our cache.
            self._cache = (toknum, toknum + num_toks, list(tokens))

            # Update our mapping.
            assert toknum <= self._toknum[-1]
            if num_toks > 0:
                block_index += 1
                if toknum == self._toknum[-1]:
                    assert new_filepos > self._filepos[-1]  # monotonic!
                    self._filepos.append(new_filepos)
                    self._toknum.append(toknum + num_toks)
                else:
                    # Check for consistency:
                    assert (
                        new_filepos == self._filepos[block_index]
                    ), 'inconsistent block reader (num chars read)'
                    assert (
                        toknum + num_toks == self._toknum[block_index]
                    ), 'inconsistent block reader (num tokens returned)'

            # If we reached the end of the file, then update self._len
            if new_filepos == self._eofpos:
                self._len = toknum + num_toks
            # Generate the tokens in this block (but skip any tokens
            # before start_tok). Note that between yields, our state
            # may be modified.
            for tok in tokens[max(0, start_tok - toknum) :]:
                yield tok
            # If we're at the end of the file, then we're done.
            assert new_filepos <= self._eofpos
            if new_filepos == self._eofpos:
                break
            # Update our indices
            toknum += num_toks
            filepos = new_filepos

        # If we reach this point, then we should know our length.
        assert self._len is not None
        # Enforce closing of stream once we reached end of file
        # We should have reached EOF once we're out of the while loop.
        self.close()

    # Use concat for these, so we can use a ConcatenatedCorpusView
    # when possible.
    def __add__(self, other):
        return concat([self, other])

    def __radd__(self, other):
        return concat([other, self])

    def __mul__(self, count):
        return concat([self] * count)

    def __rmul__(self, count):
        return concat([self] * count)


class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusView`` objects. At most one file handle
    is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins. In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            for tok in piece.iterate_from(max(0, start_tok - offset)):
                yield tok

            # Update the offset table.
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1


def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function. This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set(d.__class__ for d in docs)

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, string_types) for doc in docs):
        return ''.join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element('documents')
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)


######################################################################
# { Corpus View for Pickled Sequences
######################################################################


class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    ``pickle.dump``). One use case for this class is to store the
    result of running feature detection on a corpus to disk. This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory. The following example illustrates this technique:

        >>> from nltk.corpus.reader.util import PickleCorpusView
        >>> from nltk.util import LazyMap
        >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP
        >>> PickleCorpusView.write(feature_corpus, some_fileid) # doctest: +SKIP
        >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
    """

    BLOCK_SIZE = 100
    PROTOCOL = -1

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        ``fileid``.

        :param delete_on_gc: If true, then ``fileid`` will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        result = []
        for i in range(self.BLOCK_SIZE):
            try:
                result.append(pickle.load(stream))
            except EOFError:
                break
        return result

    def __del__(self):
        """
        If ``delete_on_gc`` was set to true when this
        ``PickleCorpusView`` was created, then delete the corpus view's
        fileid. (This method is called whenever a
        ``PickleCorpusView`` is garbage-collected.)
        """
        if getattr(self, '_delete_on_gc', False):
            if os.path.exists(self._fileid):
                try:
                    os.remove(self._fileid)
                except (OSError, IOError):
                    pass
        self.__dict__.clear()  # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        if isinstance(output_file, string_types):
            output_file = open(output_file, 'wb')
        for item in sequence:
            pickle.dump(item, output_file, cls.PROTOCOL)

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a ``PickleCorpusView`` view for that
        temporary corpus file.

        :param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        """
        try:
            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
            output_file = os.fdopen(fd, 'wb')
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except (OSError, IOError) as e:
            raise ValueError('Error while creating temp file: %s' % e)


######################################################################
# { Block Readers
######################################################################


def read_whitespace_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(stream.readline().split())
    return toks


def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks


def read_line_block(stream):
    toks = []
    for i in range(20):
        line = stream.readline()
        if not line:
            return toks
        toks.append(line.rstrip('\n'))
    return toks


def read_blankline_block(stream):
    s = ''
    while True:
        line = stream.readline()
        # End of file:
        if not line:
            if s:
                return [s]
            else:
                return []
        # Blank line:
        elif line and not line.strip():
            if s:
                return [s]
        # Other line:
        else:
            s += line


def read_alignedsent_block(stream):
    s = ''
    while True:
        line = stream.readline()
        # End of file:
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator or blank line:
        if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':
            continue
        # Other line:
        else:
            s += line
            if re.match(r'^\d+-\d+', line) is not None:
                return [s]


def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``. If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Scan until we find a line matching the start regexp.
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file.
        if re.match(start_re, line):
            break

    # Scan until we find another line matching the regexp, or EOF.
    lines = [line]
    while True:
        oldpos = stream.tell()
        line = stream.readline()
        # End of file:
        if not line:
            return [''.join(lines)]
        # End of token:
        if end_re is not None and re.match(end_re, line):
            return [''.join(lines)]
        # Start of new token: backup to just before it starts, and
        # return the token we've already collected.
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)
            return [''.join(lines)]
        # Anything else is part of the token.
        lines.append(line)


def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end of the last complete s-expression
    read. This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading. If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments. Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, 'encoding', None)
    assert encoding is not None or isinstance(block, text_type)
    if encoding not in (None, 'utf-8'):
        import warnings

        warnings.warn(
            'Parsing may fail, depending on the properties '
            'of the %s encoding!'
            % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters. (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r'\s*').search(block, offset).end()

            # Move to the end position.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == 'Block too small':
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise


def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces that's the same length as the matched string."""
    return ' ' * (m.end() - m.start())


def _parse_sexpr_block(block):
    tokens = []
    start = end = 0

    while end < len(block):
        m = re.compile(r'\S').search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != '(':
            m2 = re.compile(r'[\s(]').search(block, start)
            if m2:
                end = m2.start()
            else:
                if tokens:
                    return tokens, end
                raise ValueError('Block too small')

        # Case 2: parenthesized sexpr.
        else:
            nesting = 0
            for m in re.compile(r'[()]').finditer(block, start):
                if m.group() == '(':
                    nesting += 1
                else:
                    nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                if tokens:
                    return tokens, end
                raise ValueError('Block too small')

        tokens.append(block[start:end])

    return tokens, end


######################################################################
# { Finding Corpus Items
######################################################################


def find_corpus_fileids(root, regexp):
    if not isinstance(root, PathPointer):
        raise TypeError('find_corpus_fileids: expected a PathPointer')
    regexp += '$'

    # Find fileids in a zipfile: scan the zipfile's namelist. Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        fileids = [
            name[len(root.entry) :]
            for name in root.zipfile.namelist()
            if not name.endswith('/')
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        # workaround for py25 which doesn't support followlinks
        kwargs = {}
        if not py25():
            kwargs = {'followlinks': True}
        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
            items += [
                prefix + fileid
                for fileid in fileids
                if re.match(regexp, prefix + fileid)
            ]
            # Don't visit svn directories:
            if '.svn' in subdirs:
                subdirs.remove('.svn')
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)


def _path_from(parent, child):
    if os.path.split(parent)[1] == '':
        parent = os.path.split(parent)[0]
    path = []
    while parent != child:
        child, dirname = os.path.split(child)
        path.insert(0, dirname)
        assert os.path.split(child)[0] != child
    return path


######################################################################
# { Paragraph structure in Treebank files
######################################################################


def tagged_treebank_para_block_reader(stream):
    # Read the next paragraph.
    para = ''
    while True:
        line = stream.readline()
        # End of paragraph:
        if re.match(r'======+\s*$', line):
            if para.strip():
                return [para]
        # End of file:
        elif line == '':
            if para.strip():
                return [para]
            else:
                return []
        # Content line:
        else:
            para += line
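

######################################################################
# { Usage sketch
######################################################################

# The demo below is an illustrative sketch, not part of the NLTK API:
# it shows how one of the block readers defined above plugs into
# StreamBackedCorpusView. The throwaway corpus file it writes is a
# made-up example and is deleted afterwards.
if __name__ == '__main__':
    # Create a small temporary corpus file so the demo is self-contained.
    _fd, _demo_path = tempfile.mkstemp('.txt', 'nltk-corpusview-demo-')
    with os.fdopen(_fd, 'w') as _demo_file:
        _demo_file.write('the quick brown fox\njumps over the lazy dog\n')

    # Each call to read_whitespace_block() reads up to 20 lines and
    # returns their whitespace-separated tokens; the view requests
    # blocks lazily as items are accessed.
    view = StreamBackedCorpusView(_demo_path, read_whitespace_block)
    print(len(view))   # 9 tokens in total
    print(view[:4])    # ['the', 'quick', 'brown', 'fox']
    view.close()
    os.remove(_demo_path)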