# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the SemCor Corpus.
"""
from __future__ import absolute_import, unicode_literals

__docformat__ = 'epytext en'

from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree


class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param wordnet: The WordNet corpus reader used to resolve sense keys
            into Lemma objects.
        :param lazy: If true, view methods return lazy stream-backed views;
            otherwise files are parsed eagerly into lists.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, 'word', False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'chunk', False, False, False)

    # NOTE: the default was previously written ``('pos' or 'sem' or 'both')``,
    # which evaluates to the string 'pos' at definition time; the plain default
    # below is behavior-identical but no longer misleading.
    def tagged_chunks(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', False, tag != 'sem', tag != 'pos')

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'word', True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, 'chunk', True, False, False)

    # See the NOTE above tagged_chunks() regarding the default value of `tag`.
    def tagged_sents(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', True, tag != 'sem', tag != 'pos')

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        """
        Dispatch helper for the public view methods: builds one view (lazy or
        eager) per file and concatenates them.
        """
        if unit == 'word' and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            reader = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            reader = SemcorWordView if self._lazy else self._words
        return concat(
            [
                reader(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ('token', 'word', 'chunk')
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == 'word':
                    # a multiword token is split into several words
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert a single ``<wf>``/``<punc>`` XML element into a token string,
        a list of words, or a (possibly tagged) chunk, depending on `unit`.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get('lemma', tkn)  # lemma or NE class
        lexsn = xmlword.get('lexsn')  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + '%' + lexsn
            wnpos = ('n', 'v', 'a', 'r', 's')[
                int(lexsn.split(':')[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            'rdf', tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get('wnsn')  # WordNet sense number
        isOOVEntity = 'pn' in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            'pos'
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == 'token':
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split('_')  # TODO: case where punctuation intervenes in MWE
            if unit == 'word':
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        #  solution: just use the lemma name as a string
                        try:
                            sense = '%s.%s.%02d' % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + '.' + wnpos + '.' + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree('NE', bottom)])
                    else:  # 'other' NE
                        return Tree('NE', bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list


def _all_xmlwords_in(elt, result=None):
    """
    Recursively collect all ``<wf>`` and ``<punc>`` descendants of `elt`,
    in document order.
    """
    if result is None:
        result = []
    for child in elt:
        if child.tag in ('wf', 'punc'):
            result.append(child)
        else:
            _all_xmlwords_in(child, result)
    return result


class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``snum`` attribute from the XML).
    """

    def __init__(self, num, items):
        self.num = num
        list.__init__(self, items)


class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        if bracket_sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(punc|wf)'

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ('wf', 'punc'):
                itm = self.handle_word(child)
                if self._unit == 'word':
                    # flatten multiword tokens into the sentence
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError('Unexpected element %s' % child.tag)
        return SemcorSentence(elt.attrib['snum'], sent)