# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the SemCor Corpus.
"""
from __future__ import absolute_import, unicode_literals

__docformat__ = 'epytext en'

from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree


class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``chunks()``, ``sents()``, ``chunk_sents()``,
    ``tagged_chunks()``, and ``tagged_sents()``.
    """
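
    # A minimal usage sketch (not a doctest; assumes the 'semcor' and 'wordnet'
    # corpora have been downloaded via ``nltk.download()`` and that this reader
    # is exposed as ``nltk.corpus.semcor``, as in the standard NLTK setup):
    #
    #     >>> from nltk.corpus import semcor
    #     >>> semcor.words()[:10]                   # flat list of word strings
    #     >>> semcor.sents()[0]                     # first sentence, as word strings
    #     >>> semcor.tagged_chunks(tag='both')[:5]  # POS- and sense-tagged Trees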

    def __init__(self, root, fileids, wordnet, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, 'word', False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'chunk', False, False, False)
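
    # Illustrative sketch (hedged; exact output depends on the installed corpus
    # data): each chunk is one semantic unit, so a multiword named entity comes
    # back as a single list of its words, while ordinary tokens are singleton
    # lists, e.g.
    #
    #     >>> from nltk.corpus import semcor
    #     >>> semcor.chunks()[:3]
    #     [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said']]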

    def tagged_chunks(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', False, tag != 'sem', tag != 'pos')
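
    # Hedged illustration of the returned tree shapes (labels shown here are
    # examples, not guaranteed corpus output). With tag='both', a sense-tagged
    # named-entity chunk nests a POS subtree inside an 'NE' subtree inside a
    # Lemma-labelled tree; with tag='pos' only the POS tree is returned; with
    # tag='sem' the words hang directly off the semantic label:
    #
    #     Tree(Lemma('group.n.01.group'),
    #          [Tree('NE', [Tree('NNP', ['Fulton', 'County', 'Grand', 'Jury'])])])
    #     Tree('VB', ['said'])                      # tag='pos'
    #     Tree(Lemma('state.v.01.say'), ['said'])   # tag='sem'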

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'word', True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, 'chunk', True, False, False)

    def tagged_sents(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', True, tag != 'sem', tag != 'pos')
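
    # Hedged usage sketch: collecting the WordNet Lemma labels from one
    # sense-tagged sentence (assumes the 'semcor' and 'wordnet' corpora are
    # installed and the reader is available as ``nltk.corpus.semcor``):
    #
    #     >>> from nltk.corpus import semcor
    #     >>> from nltk.corpus.reader.wordnet import Lemma
    #     >>> from nltk.tree import Tree
    #     >>> sent = semcor.tagged_sents(tag='sem')[0]
    #     >>> senses = [chunk.label() for chunk in sent
    #     ...           if isinstance(chunk, Tree) and isinstance(chunk.label(), Lemma)]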

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        if unit == 'word' and not bracket_sent:
            # each item produced by the view may be a multiword unit (a list of
            # words), so LazyConcatenation flattens the stream into a single
            # flat word list
            _ = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat(
            [
                _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )
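
    # Why the flattening step exists (hedged illustration): with unit='word',
    # a single multiword <wf> element yields several words, so the underlying
    # view produces a list of lists, e.g.
    #
    #     [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ...]
    #
    # which LazyConcatenation lazily flattens into
    #
    #     ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]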

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ('token', 'word', 'chunk')
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == 'word':
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get('lemma', tkn)  # lemma or NE class
        lexsn = xmlword.get('lexsn')  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + '%' + lexsn
            wnpos = ('n', 'v', 'a', 'r', 's')[
                int(lexsn.split(':')[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
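        # Hedged example of the sense-key construction above: for an element
        # such as <wf lemma="say" lexsn="2:32:00::">said</wf> this yields
        # sense_key = 'say%2:32:00::' and, since the ss_type field is 2,
        # wnpos = 'v' (1=n, 2=v, 3=a, 4=r, 5=s per the senseidx documentation).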
        # Redefinition: the "rdf" attribute indicates that the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical
        # adjustments or discontinuity of a multiword expression. If a
        # redefinition has occurred, "rdf" holds the inflected form and "lemma"
        # holds its lemma. For NEs, "rdf", "lemma", and "pn" all hold the same
        # value (the NE class).
        redef = xmlword.get('rdf', tkn)
        sensenum = xmlword.get('wnsn')  # WordNet sense number
        isOOVEntity = 'pn' in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get('pos')  # part of speech for the whole chunk (None for punctuation)

        if unit == 'token':
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split('_')  # TODO: case where punctuation intervenes in MWE
            if unit == 'word':
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = '%s.%s.%02d' % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + '.' + wnpos + '.' + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree('NE', bottom)])
                    else:  # 'other' NE
                        return Tree('NE', bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list


def _all_xmlwords_in(elt, result=None):
    if result is None:
        result = []
    for child in elt:
        if child.tag in ('wf', 'punc'):
            result.append(child)
        else:
            _all_xmlwords_in(child, result)
    return result
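
# Hedged sketch of the element structure this walker expects (attribute names
# match those consumed by SemcorCorpusReader._word; real SemCor files may carry
# additional attributes such as "cmd" or "rdf"):
#
#   <s snum="1">
#     <wf cmd="done" pos="VB" lemma="say" wnsn="1" lexsn="2:32:00::">said</wf>
#     <punc>.</punc>
#   </s>
#
# <wf> and <punc> elements are collected in document order; any other child
# element is recursed into.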


class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``snum`` attribute from the XML).
    """

    def __init__(self, num, items):
        self.num = num
        list.__init__(self, items)


class SemcorWordView(XMLCorpusView):
    """
    A stream-backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        if bracket_sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(punc|wf)'

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ('wf', 'punc'):
                itm = self.handle_word(child)
                if self._unit == 'word':
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError('Unexpected element %s' % child.tag)
        return SemcorSentence(elt.attrib['snum'], sent)