1# Natural Language Toolkit: Chunked Corpus Reader
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Steven Bird <stevenbird1@gmail.com>
5#         Edward Loper <edloper@gmail.com>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8
9"""
10A reader for corpora that contain chunked (and optionally tagged)
11documents.
12"""
13
14import os.path, codecs
15
16from six import string_types
17
18import nltk
19from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
20from nltk.tree import Tree
21from nltk.tokenize import *
22from nltk.chunk import tagstr2tree
23from nltk.corpus.reader.util import *
24from nltk.corpus.reader.api import *
25
26
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension='',
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: Unused; retained for backward compatibility.
        :param str2chunktree: Function mapping one sentence string to a
            chunk tree (default ``nltk.chunk.tagstr2tree``).
        :param sent_tokenizer: Tokenizer used to split each paragraph
            block into sentence strings (default: one sentence per line).
        :param para_block_reader: Function reading one paragraph block
            at a time from a stream (default: split on blank lines).
        :param encoding: Encoding used to decode the corpus files.
        :param tagset: The tagset used by this corpus's annotations;
            passed through to ``str2chunktree`` as ``source_tagset``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader, source_tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _view(
        self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None
    ):
        """
        Build one ``ChunkedCorpusView`` per fileid with the given output
        flags, and concatenate them into a single list-like view.  This
        is the shared implementation behind all the public accessors.
        """
        return concat(
            [
                ChunkedCorpusView(
                    f,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._view(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._view(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._view(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._view(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._view(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._view(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._view(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._view(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._view(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        # Default block parser: one blank-line-separated block at a
        # time, each sentence converted with the default tagstr2tree.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
215
216
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view over a chunked (and optionally tagged)
    corpus file.  The boolean flags passed to the constructor control
    the shape of the items this view yields: whether tags are kept,
    whether chunk structure is kept, and whether tokens are grouped
    into sentences and/or paragraphs.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-shape flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing machinery: paragraph reader -> sentence tokenizer ->
        # string-to-chunktree converter.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        # Tagset names forwarded to str2chunktree for tag mapping.
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block from *stream* and return its items,
        shaped according to this view's flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tree = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # Strip tags and/or chunk structure when not requested.
                if not self._tagged:
                    tree = self._untag(tree)
                if not self._chunked:
                    tree = tree.leaves()

                # Either keep the sentence as one item, or splice its
                # tokens directly into the paragraph.
                if self._group_by_sent:
                    para.append(tree)
                else:
                    para.extend(tree)

            # Likewise for the paragraph within the block.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block

    def _untag(self, tree):
        """Destructively replace every ``(word, tag)`` leaf of *tree*
        with just the word, recursing into nested chunks.  Returns the
        (mutated) tree for convenience."""
        for idx in range(len(tree)):
            node = tree[idx]
            if isinstance(node, Tree):
                self._untag(node)
            elif isinstance(node, tuple):
                tree[idx] = node[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree
286