# -*- coding: iso-8859-1 -*-

# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Selina Dennis <selina@tranzfusion.net>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts. The corpus is distributed by the
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
with NLTK.

The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text comply
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
"""

import os
import re

from six import string_types

from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers, one over the ``psd`` subdirectory
    of the corpus root (parsed files) and one over the ``pos``
    subdirectory (tagged files).  A *document identifier* is a file
    identifier with its four-character ``.psd``/``.pos`` extension
    removed; the public access methods take document identifiers.
    """

    def __init__(self, root, encoding='utf8'):
        """
        :param root: The corpus root; it is joined with ``'psd'`` and
            ``'pos'`` to locate the two sub-corpora.
        :param encoding: The encoding handed to the base reader and to
            both sub-readers.
        :raise ValueError: If the ``psd`` and ``pos`` sub-corpora do not
            contain the same set of documents.
        """
        # First initialisation with an empty file list: ``self.root`` is
        # needed below to build the sub-readers, and the real file list is
        # only known once they exist.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): '.psd' is handed to BracketParseCorpusReader as its
        # third positional argument; kept as-is -- confirm that this is the
        # intended parameter.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join('psd'), '.*', '.psd', encoding=encoding
        )
        # The encoding is passed by keyword so that the tagged files are
        # decoded with the same encoding as the parsed files.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join('pos'), '.*', encoding=encoding
        )

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')

        fileids = sorted(
            ['%s.psd' % doc for doc in documents]
            + ['%s.pos' % doc for doc in documents]
        )
        # Second initialisation, now with the complete list of file ids.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, an iterable of file
            identifiers, or None for every document.
        :raise KeyError: If a given file identifier is not part of this
            corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, string_types):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Report the offending identifier, not the whole argument.
                raise KeyError('File id %s not found' % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every file.
        """
        if documents is None:
            return self._fileids
        if isinstance(documents, string_types):
            documents = [documents]
        return sorted(
            set(
                ['%s.pos' % doc for doc in documents]
                + ['%s.psd' % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every document.
        :param subcorpus: The extension to append, ``'pos'`` or ``'psd'``.
        :raise ValueError: If a document identifier is unknown.  A
            dedicated message is used when the value looks like a file
            identifier (ends in ``.pos`` or ``.psd``).
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, string_types):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in ('.pos', '.psd'):
                        raise ValueError(
                            'Expected a document identifier, not a file '
                            'identifier. (Use corpus.documents() to get '
                            'a list of document identifiers.)'
                        )
                    else:
                        raise ValueError('Document identifier %s not found' % document)
        return ['%s.%s' % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Return the tagged sub-reader's ``words()`` for the given document(s)."""
        return self._pos_reader.words(self._getfileids(documents, 'pos'))

    def sents(self, documents=None):
        """Return the tagged sub-reader's ``sents()`` for the given document(s)."""
        return self._pos_reader.sents(self._getfileids(documents, 'pos'))

    def paras(self, documents=None):
        """Return the tagged sub-reader's ``paras()`` for the given document(s)."""
        return self._pos_reader.paras(self._getfileids(documents, 'pos'))

    def tagged_words(self, documents=None):
        """Return the tagged sub-reader's ``tagged_words()`` for the given document(s)."""
        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))

    def tagged_sents(self, documents=None):
        """Return the tagged sub-reader's ``tagged_sents()`` for the given document(s)."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))

    def tagged_paras(self, documents=None):
        """Return the tagged sub-reader's ``tagged_paras()`` for the given document(s)."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))

    def parsed_sents(self, documents=None):
        """Return the parsed sub-reader's ``parsed_sents()`` for the given document(s)."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))


class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        """
        Remove ``(CODE ...)`` and ``(ID ...)`` nodes from the bracketed
        string ``t`` and parse what is left.

        :return: The result of ``BracketParseCorpusReader._parse``, or
            None when nothing but an empty pair of parentheses remains.
        """
        t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        if re.match(r'\s*\(\s*\)\s*$', t):
            return None
        return BracketParseCorpusReader._parse(self, t)


class YCOETaggedCorpusReader(TaggedCorpusReader):
    """
    Tagged corpus reader for the YCOE ``pos`` files: word and tag are
    separated by ``_``, and a gap-based regexp tokenizer is used as the
    sentence tokenizer.
    """

    def __init__(self, root, items, encoding='utf8'):
        """
        :param root: The root directory of the tagged files.
        :param items: The file identifiers (or a regexp matching them),
            passed through to ``TaggedCorpusReader``.
        :param encoding: The encoding used to read the files; forwarded
            to ``TaggedCorpusReader`` so that it is actually honoured.
        """
        # Sentence gaps: whitespace following a '/.' sequence, or any
        # token ending in '_CODE' / '_ID' with its surrounding whitespace.
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep='_',
            sent_tokenizer=sent_tokenizer,
            encoding=encoding,
        )


#: A list of all documents and their titles in ycoe.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies',
}