1# Natural Language Toolkit: Penn Treebank Reader
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Steven Bird <stevenbird1@gmail.com>
5#         Edward Loper <edloper@gmail.com>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8"""
9Corpus reader for corpora that consist of parenthesis-delineated parse trees.
10"""
11
12import sys
13
14from nltk.tree import Tree
15from nltk.tag import map_tag
16
17from nltk.corpus.reader.util import *
18from nltk.corpus.reader.api import *
19
20# we use [^\s()]+ instead of \S+? to avoid matching ()
21SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
22TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
23WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
24EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
25
26
27class BracketParseCorpusReader(SyntaxCorpusReader):
28    """
29    Reader for corpora that consist of parenthesis-delineated parse trees,
30    like those found in the "combined" section of the Penn Treebank,
31    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
32
33    """
34
35    def __init__(
36        self,
37        root,
38        fileids,
39        comment_char=None,
40        detect_blocks='unindented_paren',
41        encoding='utf8',
42        tagset=None,
43    ):
44        """
45        :param root: The root directory for this corpus.
46        :param fileids: A list or regexp specifying the fileids in this corpus.
47        :param comment_char: The character which can appear at the start of
48            a line to indicate that the rest of the line is a comment.
49        :param detect_blocks: The method that is used to find blocks
50          in the corpus; can be 'unindented_paren' (every unindented
51          parenthesis starts a new parse) or 'sexpr' (brackets are
52          matched).
53        :param tagset: The name of the tagset used by this corpus, to be used
54              for normalizing or converting the POS tags returned by the
55              tagged_...() methods.
56        """
57        # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
58        #       from CorpusReader?
59        CorpusReader.__init__(self, root, fileids, encoding)
60        self._comment_char = comment_char
61        self._detect_blocks = detect_blocks
62        self._tagset = tagset
63
64    def _read_block(self, stream):
65        if self._detect_blocks == 'sexpr':
66            return read_sexpr_block(stream, comment_char=self._comment_char)
67        elif self._detect_blocks == 'blankline':
68            return read_blankline_block(stream)
69        elif self._detect_blocks == 'unindented_paren':
70            # Tokens start with unindented left parens.
71            toks = read_regexp_block(stream, start_re=r'^\(')
72            # Strip any comments out of the tokens.
73            if self._comment_char:
74                toks = [
75                    re.sub('(?m)^%s.*' % re.escape(self._comment_char), '', tok)
76                    for tok in toks
77                ]
78            return toks
79        else:
80            assert 0, 'bad block type'
81
82    def _normalize(self, t):
83        # If there's an empty set of brackets surrounding the actual
84        # parse, then strip them off.
85        if EMPTY_BRACKETS.match(t):
86            t = t.strip()[1:-1]
87        # Replace leaves of the form (!), (,), with (! !), (, ,)
88        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
89        # Replace leaves of the form (tag word root) with (tag word)
90        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
91        return t
92
93    def _parse(self, t):
94        try:
95            return Tree.fromstring(self._normalize(t))
96
97        except ValueError as e:
98            sys.stderr.write("Bad tree detected; trying to recover...\n")
99            # Try to recover, if we can:
100            if e.args == ('mismatched parens',):
101                for n in range(1, 5):
102                    try:
103                        v = Tree(self._normalize(t + ')' * n))
104                        sys.stderr.write(
105                            "  Recovered by adding %d close " "paren(s)\n" % n
106                        )
107                        return v
108                    except ValueError:
109                        pass
110            # Try something else:
111            sys.stderr.write("  Recovered by returning a flat parse.\n")
112            # sys.stderr.write(' '.join(t.split())+'\n')
113            return Tree('S', self._tag(t))
114
115    def _tag(self, t, tagset=None):
116        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
117        if tagset and tagset != self._tagset:
118            tagged_sent = [
119                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
120            ]
121        return tagged_sent
122
123    def _word(self, t):
124        return WORD.findall(self._normalize(t))
125
126
127class CategorizedBracketParseCorpusReader(
128    CategorizedCorpusReader, BracketParseCorpusReader
129):
130    """
131    A reader for parsed corpora whose documents are
132    divided into categories based on their file identifiers.
133    @author: Nathan Schneider <nschneid@cs.cmu.edu>
134    """
135
136    def __init__(self, *args, **kwargs):
137        """
138        Initialize the corpus reader.  Categorization arguments
139        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
140        the L{CategorizedCorpusReader constructor
141        <CategorizedCorpusReader.__init__>}.  The remaining arguments
142        are passed to the L{BracketParseCorpusReader constructor
143        <BracketParseCorpusReader.__init__>}.
144        """
145        CategorizedCorpusReader.__init__(self, kwargs)
146        BracketParseCorpusReader.__init__(self, *args, **kwargs)
147
148    def _resolve(self, fileids, categories):
149        if fileids is not None and categories is not None:
150            raise ValueError('Specify fileids or categories, not both')
151        if categories is not None:
152            return self.fileids(categories)
153        else:
154            return fileids
155
156    def raw(self, fileids=None, categories=None):
157        return BracketParseCorpusReader.raw(self, self._resolve(fileids, categories))
158
159    def words(self, fileids=None, categories=None):
160        return BracketParseCorpusReader.words(self, self._resolve(fileids, categories))
161
162    def sents(self, fileids=None, categories=None):
163        return BracketParseCorpusReader.sents(self, self._resolve(fileids, categories))
164
165    def paras(self, fileids=None, categories=None):
166        return BracketParseCorpusReader.paras(self, self._resolve(fileids, categories))
167
168    def tagged_words(self, fileids=None, categories=None, tagset=None):
169        return BracketParseCorpusReader.tagged_words(
170            self, self._resolve(fileids, categories), tagset
171        )
172
173    def tagged_sents(self, fileids=None, categories=None, tagset=None):
174        return BracketParseCorpusReader.tagged_sents(
175            self, self._resolve(fileids, categories), tagset
176        )
177
178    def tagged_paras(self, fileids=None, categories=None, tagset=None):
179        return BracketParseCorpusReader.tagged_paras(
180            self, self._resolve(fileids, categories), tagset
181        )
182
183    def parsed_words(self, fileids=None, categories=None):
184        return BracketParseCorpusReader.parsed_words(
185            self, self._resolve(fileids, categories)
186        )
187
188    def parsed_sents(self, fileids=None, categories=None):
189        return BracketParseCorpusReader.parsed_sents(
190            self, self._resolve(fileids, categories)
191        )
192
193    def parsed_paras(self, fileids=None, categories=None):
194        return BracketParseCorpusReader.parsed_paras(
195            self, self._resolve(fileids, categories)
196        )
197
198
199class AlpinoCorpusReader(BracketParseCorpusReader):
200    """
201    Reader for the Alpino Dutch Treebank.
202    This corpus has a lexical breakdown structure embedded, as read by _parse
203    Unfortunately this puts punctuation and some other words out of the sentence
204    order in the xml element tree. This is no good for tag_ and word_
205    _tag and _word will be overridden to use a non-default new parameter 'ordered'
206    to the overridden _normalize function. The _parse function can then remain
207    untouched.
208    """
209
210    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
211        BracketParseCorpusReader.__init__(
212            self,
213            root,
214            'alpino\.xml',
215            detect_blocks='blankline',
216            encoding=encoding,
217            tagset=tagset,
218        )
219
220    def _normalize(self, t, ordered=False):
221        """Normalize the xml sentence element in t.
222        The sentence elements <alpino_ds>, although embedded in a few overall
223        xml elements, are seperated by blank lines. That's how the reader can
224        deliver them one at a time.
225        Each sentence has a few category subnodes that are of no use to us.
226        The remaining word nodes may or may not appear in the proper order.
227        Each word node has attributes, among which:
228        - begin : the position of the word in the sentence
229        - pos   : Part of Speech: the Tag
230        - word  : the actual word
231        The return value is a string with all xml elementes replaced by
232        clauses: either a cat clause with nested clauses, or a word clause.
233        The order of the bracket clauses closely follows the xml.
234        If ordered == True, the word clauses include an order sequence number.
235        If ordered == False, the word clauses only have pos and word parts.
236        """
237        if t[:10] != "<alpino_ds":
238            return ""
239        # convert XML to sexpr notation
240        t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t)
241        if ordered:
242            t = re.sub(
243                r'  <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
244                r"(\1 \2 \3)",
245                t,
246            )
247        else:
248            t = re.sub(r'  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
249        t = re.sub(r"  </node>", r")", t)
250        t = re.sub(r"<sentence>.*</sentence>", r"", t)
251        t = re.sub(r"</?alpino_ds.*>", r"", t)
252        return t
253
254    def _tag(self, t, tagset=None):
255        tagged_sent = [
256            (int(o), w, p)
257            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
258        ]
259        tagged_sent.sort()
260        if tagset and tagset != self._tagset:
261            tagged_sent = [
262                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
263            ]
264        else:
265            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
266        return tagged_sent
267
268    def _word(self, t):
269        """Return a correctly ordered list if words"""
270        tagged_sent = self._tag(t)
271        return [w for (w, p) in tagged_sent]
272