# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""

import os.path, codecs

from six import string_types

import nltk
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
from nltk.tokenize import *
from nltk.chunk import tagstr2tree
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.

    Three configurable steps turn raw file text into chunk trees: a
    block reader splits documents into paragraphs, a sentence tokenizer
    splits paragraphs into sentences, and a string-to-chunktree function
    parses each sentence.  Any step may be replaced with a custom
    callable; the defaults split paragraphs on blank lines, treat each
    line as one sentence, and parse sentences with
    ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension='',
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments forwarded to every corpus view built by this reader:
        # (str2chunktree, sent_tokenizer, para_block_reader, tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _views(self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None):
        """Create one ChunkedCorpusView per requested fileid, with the
        given structure flags."""
        return [
            ChunkedCorpusView(
                fileid,
                enc,
                tagged,
                group_by_sent,
                group_by_para,
                chunked,
                *self._cv_args,
                target_tagset=tagset
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if isinstance(fileids, string_types):
            fileids = [fileids]
        elif fileids is None:
            fileids = self._fileids
        return concat([self.open(fileid).read() for fileid in fileids])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(self._views(fileids, 0, 0, 0, 0))

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return concat(self._views(fileids, 0, 1, 0, 0))

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(self._views(fileids, 0, 1, 1, 0))

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return concat(self._views(fileids, 1, 0, 0, 0, tagset))

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return concat(self._views(fileids, 1, 1, 0, 0, tagset))

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return concat(self._views(fileids, 1, 1, 1, 0, tagset))

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return concat(self._views(fileids, 1, 0, 0, 1, tagset))

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return concat(self._views(fileids, 1, 1, 0, 1, tagset))

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return concat(self._views(fileids, 1, 1, 1, 1, tagset))

    def _read_block(self, stream):
        # Default block reader: one chunk tree per blank-line-separated block.
        return list(map(tagstr2tree, read_blankline_block(stream)))


class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed view of a single chunked corpus file.  The boolean
    flags control how much structure each list item retains: whether
    words keep their tags, whether they are grouped into sentences
    and/or paragraphs, and whether chunk trees are preserved.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-structure flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing pipeline supplied by the reader.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block from *stream* and return its items
        in the representation selected by this view's flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            # Grouped sentences are appended whole; otherwise their
            # tokens are flattened into the paragraph list.
            add_sent = para.append if self._group_by_sent else para.extend
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tree = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # Strip whatever structure the caller did not ask for.
                if not self._tagged:
                    tree = self._untag(tree)
                if not self._chunked:
                    tree = tree.leaves()

                add_sent(tree)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block

    def _untag(self, tree):
        """Replace every ``(word, tag)`` leaf of *tree* with ``word``,
        modifying the tree in place; return the same tree."""
        for idx, node in enumerate(tree):
            if isinstance(node, tuple):
                tree[idx] = node[0]
            elif isinstance(node, Tree):
                self._untag(node)
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree