# Natural Language Toolkit: Corpus Reader Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import os
import bisect
import re
import tempfile
from functools import reduce

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:  # Use the c version of ElementTree, which is faster, if possible.
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

from six import string_types, text_type

from nltk.tokenize import wordpunct_tokenize
from nltk.internals import slice_bounds
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.data import SeekableUnicodeStreamReader
from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation, py25

######################################################################
# { Corpus View
######################################################################


class StreamBackedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file, which acts like a sequence of tokens:
    it can be accessed by index, iterated over, etc.  However, the
    tokens are only constructed as-needed -- the entire corpus is
    never stored in memory at once.

    The constructor to ``StreamBackedCorpusView`` takes two arguments:
    a corpus fileid (specified as a string or as a ``PathPointer``);
    and a block reader.  A "block reader" is a function that reads
    zero or more tokens from a stream, and returns them as a list.  A
    very simple example of a block reader is:

        >>> def simple_block_reader(stream):
        ...     return stream.readline().split()

    This simple block reader reads one line at a time, and returns
    one token (a string) for each whitespace-separated substring on
    that line.
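
    For example, a view over a whitespace-tokenized text file could
    be constructed as follows (``'corpus.txt'`` is an illustrative
    filename, not a real corpus file):

        >>> view = StreamBackedCorpusView('corpus.txt', simple_block_reader) # doctest: +SKIP
        >>> view[0] # doctest: +SKIP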

    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader.  Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block).  On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus.  (But note that larger block sizes will *not*
    decrease performance for iteration.)

    Internally, ``CorpusView`` maintains a partial mapping from token
    index to file position, with one entry per block.  When a token
    with a given index *i* is requested, the ``CorpusView`` constructs
    it as follows:

      1. First, it searches the toknum/filepos mapping for the token
         index closest to (but less than or equal to) *i*.

      2. Then, starting at the file position corresponding to that
         index, it reads one block at a time using the block reader
         until it reaches the requested token.

    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping.  (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    :note: Each ``CorpusView`` object internally maintains an open file
        object for its underlying corpus file.  This file should be
        automatically closed when the ``CorpusView`` is garbage collected,
        but if you wish to close it manually, use the ``close()``
        method.  If you access a ``CorpusView``'s items after it has been
        closed, the file object will be automatically re-opened.

    :warning: If the contents of the file are modified during the
        lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
        is undefined.

    :warning: If a unicode encoding is specified when constructing a
        ``CorpusView``, then the block reader may only call
        ``stream.seek()`` with offsets that have been returned by
        ``stream.tell()``; in particular, calling ``stream.seek()`` with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    :ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    :ivar _toknum: A list containing the token index of each block
        that has been processed.  In particular, ``_toknum[i]`` is the
        token index of the first token in block ``i``.  Together
        with ``_filepos``, this forms a partial mapping between token
        indices and file positions.
    :ivar _filepos: A list containing the file position of each block
        that has been processed.  In particular, ``_filepos[i]`` is the
        file position of the first character in block ``i``.  Together
        with ``_toknum``, this forms a partial mapping between token
        indices and file positions.
    :ivar _stream: The stream used to access the underlying corpus file.
    :ivar _len: The total number of tokens in the corpus, if known;
        or None, if the number of tokens is not yet known.
    :ivar _eofpos: The file position of the end of the file (i.e., the
        file's size).  This is calculated when the corpus view is
        initialized, and is used to decide when the end of file has
        been reached.
    :ivar _cache: A cache of the most recently read block.  It
       is encoded as a tuple (start_toknum, end_toknum, tokens), where
       start_toknum is the token index of the first token in the block;
       end_toknum is the token index of the first token not in the
       block; and tokens is a list of the tokens in the block.
    """

    def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'):
        """
        Create a new corpus view, based on the file ``fileid``, and
        read with ``block_reader``.  See the class documentation
        for more information.

        :param fileid: The path to the file that is read by this
            corpus view.  ``fileid`` can either be a string or a
            ``PathPointer``.

        :param startpos: The file position at which the view will
            start reading.  This can be used to skip over preface
            sections.

        :param encoding: The unicode encoding that should be used to
            read the file's contents.  If no encoding is specified,
            then the file's contents will be read as raw bytes (i.e.,
            without decoding).
        """
        if block_reader:
            self.read_block = block_reader
        # Initialize our toknum/filepos mapping.
        self._toknum = [0]
        self._filepos = [startpos]
        self._encoding = encoding
        # We don't know our length (number of tokens) yet.
        self._len = None

        self._fileid = fileid
        self._stream = None

        self._current_toknum = None
        """This variable is set to the index of the next token that
           will be read, immediately before ``self.read_block()`` is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current token number."""

        self._current_blocknum = None
        """This variable is set to the index of the next block that
           will be read, immediately before ``self.read_block()`` is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current block number."""

        # Find the length of the file.
        try:
            if isinstance(self._fileid, PathPointer):
                self._eofpos = self._fileid.file_size()
            else:
                self._eofpos = os.stat(self._fileid).st_size
        except Exception as exc:
            raise ValueError('Unable to open or access %r -- %s' % (fileid, exc))

        # Maintain a cache of the most recently read block, to
        # increase efficiency of random access.
        self._cache = (-1, -1, None)

    fileid = property(
        lambda self: self._fileid,
        doc="""
        The fileid of the file that is accessed by this view.

        :type: str or PathPointer""",
    )

    def read_block(self, stream):
        """
        Read a block from the input stream.

        :return: a block of tokens from the input stream
        :rtype: list(any)
        :param stream: an input stream
        :type stream: stream
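
        A block reader will typically read several lines per call to
        reduce overhead.  For example (an illustrative sketch, similar
        to ``read_whitespace_block`` defined later in this module):

            >>> def read_ten_lines(stream): # doctest: +SKIP
            ...     toks = []
            ...     for i in range(10):
            ...         toks.extend(stream.readline().split())
            ...     return toks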
        """
        raise NotImplementedError('Abstract Method')

    def _open(self):
        """
        Open the file stream associated with this corpus view.  This
        is done automatically whenever a value is read from the view
        while its file stream is closed.
        """
        if isinstance(self._fileid, PathPointer):
            self._stream = self._fileid.open(self._encoding)
        elif self._encoding:
            self._stream = SeekableUnicodeStreamReader(
                open(self._fileid, 'rb'), self._encoding
            )
        else:
            self._stream = open(self._fileid, 'rb')

    def close(self):
        """
        Close the file stream associated with this corpus view.  This
        can be useful if you are worried about running out of file
        handles (although the stream should automatically be closed
        upon garbage collection of the corpus view).  If the corpus
        view is accessed after it is closed, it will be automatically
        re-opened.
        """
        if self._stream is not None:
            self._stream.close()
        self._stream = None

    def __len__(self):
        if self._len is None:
            # iterate_from() sets self._len when it reaches the end
            # of the file:
            for tok in self.iterate_from(self._toknum[-1]):
                pass
        return self._len

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop = slice_bounds(self, i)
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= start and stop <= self._cache[1]:
                return self._cache[2][start - offset : stop - offset]
            # Construct & return the result.
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0:
                i += len(self)
            if i < 0:
                raise IndexError('index out of range')
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= i < self._cache[1]:
                return self._cache[2][i - offset]
            # Use iterate_from to extract it.
            try:
                return next(self.iterate_from(i))
            except StopIteration:
                raise IndexError('index out of range')

    # If we wanted to be thread-safe, then this method would need to
    # do some locking.
    def iterate_from(self, start_tok):
        # Start by feeding from the cache, if possible.
        if self._cache[0] <= start_tok < self._cache[1]:
            for tok in self._cache[2][start_tok - self._cache[0] :]:
                yield tok
                start_tok += 1

        # Decide where in the file we should start.  If `start` is in
        # our mapping, then we can jump straight to the correct block;
        # otherwise, start at the last block we've processed.
        if start_tok < self._toknum[-1]:
            block_index = bisect.bisect_right(self._toknum, start_tok) - 1
            toknum = self._toknum[block_index]
            filepos = self._filepos[block_index]
        else:
            block_index = len(self._toknum) - 1
            toknum = self._toknum[-1]
            filepos = self._filepos[-1]

        # Open the stream, if it's not open already.
        if self._stream is None:
            self._open()

        # If the file is empty, the while loop will never run.
        # This *seems* to be all the state we need to set:
        if self._eofpos == 0:
            self._len = 0

        # Each iteration through this loop, we read a single block
        # from the stream.
        while filepos < self._eofpos:
            # Read the next block.
            self._stream.seek(filepos)
            self._current_toknum = toknum
            self._current_blocknum = block_index
            tokens = self.read_block(self._stream)
            assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
                'block reader %s() should return list or tuple.'
                % self.read_block.__name__
            )
            num_toks = len(tokens)
            new_filepos = self._stream.tell()
            assert new_filepos > filepos, (
                'block reader %s() should consume at least 1 byte (filepos=%d)'
                % (self.read_block.__name__, filepos)
            )

            # Update our cache.
            self._cache = (toknum, toknum + num_toks, list(tokens))

            # Update our mapping.
            assert toknum <= self._toknum[-1]
            if num_toks > 0:
                block_index += 1
                if toknum == self._toknum[-1]:
                    assert new_filepos > self._filepos[-1]  # monotonic!
                    self._filepos.append(new_filepos)
                    self._toknum.append(toknum + num_toks)
                else:
                    # Check for consistency:
                    assert (
                        new_filepos == self._filepos[block_index]
                    ), 'inconsistent block reader (num chars read)'
                    assert (
                        toknum + num_toks == self._toknum[block_index]
                    ), 'inconsistent block reader (num tokens returned)'

            # If we reached the end of the file, then update self._len
            if new_filepos == self._eofpos:
                self._len = toknum + num_toks
            # Generate the tokens in this block (but skip any tokens
            # before start_tok).  Note that between yields, our state
            # may be modified.
            for tok in tokens[max(0, start_tok - toknum) :]:
                yield tok
            # If we're at the end of the file, then we're done.
            assert new_filepos <= self._eofpos
            if new_filepos == self._eofpos:
                break
            # Update our indices
            toknum += num_toks
            filepos = new_filepos

        # If we reach this point, then we should know our length.
        assert self._len is not None
        # We should have reached the end of the file by now, so close
        # the stream.
        self.close()

    # Use concat for these, so we can use a ConcatenatedCorpusView
    # when possible.
    def __add__(self, other):
        return concat([self, other])

    def __radd__(self, other):
        return concat([other, self])

    def __mul__(self, count):
        return concat([self] * count)

    def __rmul__(self, count):
        return concat([self] * count)


class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            for tok in piece.iterate_from(max(0, start_tok - offset)):
                yield tok

            # Update the offset table.
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1


def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
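
    For example, when the documents are plain Python lists, the result
    is simply their concatenation:

        >>> concat([['a', 'b'], ['c']])
        ['a', 'b', 'c']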
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set(d.__class__ for d in docs)

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, string_types) for doc in docs):
        return ''.join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        # ``iselement`` must be checked against the documents themselves,
        # not against their class.
        if all(ElementTree.iselement(doc) for doc in docs):
            xmltree = ElementTree.Element('documents')
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)


######################################################################
# { Corpus View for Pickled Sequences
######################################################################


class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    ``pickle.dump``).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

        >>> from nltk.corpus.reader.util import PickleCorpusView
        >>> from nltk.util import LazyMap
        >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP
        >>> PickleCorpusView.write(feature_corpus, some_fileid)  # doctest: +SKIP
        >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
    """

    BLOCK_SIZE = 100
    PROTOCOL = -1

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        ``fileid``.

        :param delete_on_gc: If true, then ``fileid`` will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        result = []
        for i in range(self.BLOCK_SIZE):
            try:
                result.append(pickle.load(stream))
            except EOFError:
                break
        return result

    def __del__(self):
        """
        If ``delete_on_gc`` was set to true when this
        ``PickleCorpusView`` was created, then delete the corpus view's
        fileid.  (This method is called whenever a
        ``PickleCorpusView`` is garbage-collected.)
        """
        if getattr(self, '_delete_on_gc', False):
            if os.path.exists(self._fileid):
                try:
                    os.remove(self._fileid)
                except (OSError, IOError):
                    pass
        self.__dict__.clear()  # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        if isinstance(output_file, string_types):
            output_file = open(output_file, 'wb')
        for item in sequence:
            pickle.dump(item, output_file, cls.PROTOCOL)

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a ``PickleCorpusView`` view for that
        temporary corpus file.

        :param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
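
        For example, continuing the feature-detection example from the
        class docstring (``detect_features`` and ``corpus`` are assumed
        to be defined elsewhere):

            >>> pcv = PickleCorpusView.cache_to_tempfile(LazyMap(detect_features, corpus)) # doctest: +SKIP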
        """
        try:
            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
            output_file = os.fdopen(fd, 'wb')
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except (OSError, IOError) as e:
            raise ValueError('Error while creating temp file: %s' % e)


######################################################################
# { Block Readers
######################################################################


def read_whitespace_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(stream.readline().split())
    return toks


def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks


def read_line_block(stream):
    toks = []
    for i in range(20):
        line = stream.readline()
        if not line:
            return toks
        toks.append(line.rstrip('\n'))
    return toks


def read_blankline_block(stream):
    s = ''
    while True:
        line = stream.readline()
        # End of file:
        if not line:
            if s:
                return [s]
            else:
                return []
        # Blank line:
        elif line and not line.strip():
            if s:
                return [s]
        # Other line:
        else:
            s += line


def read_alignedsent_block(stream):
    s = ''
    while True:
        line = stream.readline()
        # End of file (check this before indexing into the line):
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator or blank line:
        if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':
            continue
        # Other line:
        s += line
        if re.match(r'^\d+-\d+', line) is not None:
            return [s]


def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
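
    For example, blocks that each begin with a line of '=' signs (as
    in Treebank-style files) could be read with a call along these
    lines (``stream`` is assumed to be an open file stream):

        >>> read_regexp_block(stream, start_re='======+') # doctest: +SKIP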
    """
    # Scan until we find a line matching the start regexp.
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file.
        if re.match(start_re, line):
            break

    # Scan until we find another line matching the regexp, or EOF.
    lines = [line]
    while True:
        oldpos = stream.tell()
        line = stream.readline()
        # End of file:
        if not line:
            return [''.join(lines)]
        # End of token:
        if end_re is not None and re.match(end_re, line):
            return [''.join(lines)]
        # Start of new token: backup to just before it starts, and
        # return the token we've already collected.
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)
            return [''.join(lines)]
        # Anything else is part of the token.
        lines.append(line)


def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end of the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
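
    For example (an illustrative call using an in-memory text stream):

        >>> from io import StringIO
        >>> read_sexpr_block(StringIO("(a (b c)) (d e)")) # doctest: +SKIP
        ['(a (b c))', '(d e)']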
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, 'encoding', None)
    assert encoding is not None or isinstance(block, text_type)
    if encoding not in (None, 'utf8', 'utf-8'):
        import warnings

        warnings.warn(
            'Parsing may fail, depending on the properties '
            'of the %s encoding!' % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r'\s*').search(block, offset).end()

            # Move to the end position.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == 'Block too small':
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise


def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces that's the same length as the matched string."""
    return ' ' * (m.end() - m.start())


def _parse_sexpr_block(block):
    tokens = []
    start = end = 0

    while end < len(block):
        m = re.compile(r'\S').search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != '(':
            m2 = re.compile(r'[\s(]').search(block, start)
            if m2:
                end = m2.start()
            else:
                if tokens:
                    return tokens, end
                raise ValueError('Block too small')

        # Case 2: parenthesized sexpr.
        else:
            nesting = 0
            for m in re.compile(r'[()]').finditer(block, start):
                if m.group() == '(':
                    nesting += 1
                else:
                    nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                if tokens:
                    return tokens, end
                raise ValueError('Block too small')

        tokens.append(block[start:end])

    return tokens, end


######################################################################
# { Finding Corpus Items
######################################################################


def find_corpus_fileids(root, regexp):
    if not isinstance(root, PathPointer):
        raise TypeError('find_corpus_fileids: expected a PathPointer')
    regexp += '$'

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        fileids = [
            name[len(root.entry) :]
            for name in root.zipfile.namelist()
            if not name.endswith('/')
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        # workaround for py25 which doesn't support followlinks
        kwargs = {}
        if not py25():
            kwargs = {'followlinks': True}
        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
            items += [
                prefix + fileid
                for fileid in fileids
                if re.match(regexp, prefix + fileid)
            ]
            # Don't visit svn directories:
            if '.svn' in subdirs:
                subdirs.remove('.svn')
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)


def _path_from(parent, child):
    if os.path.split(parent)[1] == '':
        parent = os.path.split(parent)[0]
    path = []
    while parent != child:
        child, dirname = os.path.split(child)
        path.insert(0, dirname)
        assert os.path.split(child)[0] != child
    return path


######################################################################
# { Paragraph structure in Treebank files
######################################################################


def tagged_treebank_para_block_reader(stream):
    # Read the next paragraph.
    para = ''
    while True:
        line = stream.readline()
        # End of paragraph:
        if re.match(r'======+\s*$', line):
            if para.strip():
                return [para]
        # End of file:
        elif line == '':
            if para.strip():
                return [para]
            else:
                return []
        # Content line:
        else:
            para += line
