# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""

import re
import sys

from nltk.tree import Tree
from nltk.tag import map_tag

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# we use [^\s()]+ instead of \S+? to avoid matching ()
SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')


class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks='unindented_paren',
        encoding='utf8',
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            tagged_...() methods.
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        # from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        # Dispatch on the block-detection strategy chosen at construction time.
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub('(?m)^%s.*' % re.escape(self._comment_char), '', tok)
                    for tok in toks
                ]
            return toks
        else:
            # An invalid value can only come from a bad __init__ argument.
            # Raise instead of `assert 0` so the check survives `python -O`.
            raise ValueError('bad block type %r' % self._detect_blocks)

    def _normalize(self, t):
        """Clean up a raw parse string so Tree.fromstring can handle it."""
        # If there's an empty set of brackets surrounding the actual
        # parse, then strip them off.
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one block into a Tree, attempting recovery on bad input."""
        try:
            return Tree.fromstring(self._normalize(t))

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        # BUGFIX: was `Tree(...)`, which raises TypeError for a
                        # single string argument (and that TypeError escaped the
                        # `except ValueError` below); `Tree.fromstring` actually
                        # parses the repaired string.
                        v = Tree.fromstring(self._normalize(t + ')' * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree('S', self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the block's (word, tag) pairs, optionally mapped to `tagset`."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            # Convert from the corpus's native tagset to the requested one.
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        return WORD.findall(self._normalize(t))


class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}. The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        # `fileids` and `categories` are mutually exclusive selectors.
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def raw(self, fileids=None, categories=None):
        return BracketParseCorpusReader.raw(self, self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        return BracketParseCorpusReader.words(self, self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        return BracketParseCorpusReader.sents(self, self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        return BracketParseCorpusReader.paras(self, self._resolve(fileids, categories))

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        return BracketParseCorpusReader.tagged_words(
            self, self._resolve(fileids, categories), tagset
        )

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        return BracketParseCorpusReader.tagged_sents(
            self, self._resolve(fileids, categories), tagset
        )

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        return BracketParseCorpusReader.tagged_paras(
            self, self._resolve(fileids, categories), tagset
        )

    def parsed_words(self, fileids=None, categories=None):
        return BracketParseCorpusReader.parsed_words(
            self, self._resolve(fileids, categories)
        )

    def parsed_sents(self, fileids=None, categories=None):
        return BracketParseCorpusReader.parsed_sents(
            self, self._resolve(fileids, categories)
        )

    def parsed_paras(self, fileids=None, categories=None):
        return BracketParseCorpusReader.parsed_paras(
            self, self._resolve(fileids, categories)
        )


class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by _parse
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for tag_ and word_
    _tag and _word will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
        BracketParseCorpusReader.__init__(
            self,
            root,
            # raw string: `\.` is an invalid escape in a plain string literal
            r'alpino\.xml',
            detect_blocks='blankline',
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word
        The return value is a string with all xml elements replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # BUGFIX: the pattern previously read `<node. *?begin=`, i.e. a
            # wildcard char then optional spaces, which only matched when
            # `begin` was the node's very first attribute; `.*?` matches it
            # anywhere, consistent with the unordered branch below.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        # The raw sentence text and the <alpino_ds> wrapper carry no structure.
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return (word, tag) pairs restored to sentence order via `begin`."""
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        # Sort on the `begin` index to recover the original word order.
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]