1# -*- coding: iso-8859-1 -*-
2
3# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
4#
5# Copyright (C) 2001-2015 NLTK Project
6# Author: Selina Dennis <selina@tranzfusion.net>
7# URL: <http://nltk.org/>
8# For license information, see LICENSE.TXT
9
10"""
11Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
12English Prose (YCOE), a 1.5 million word syntactically-annotated
13corpus of Old English prose texts. The corpus is distributed by the
14Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
15with NLTK.
16
The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text comply
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
20"""
21
22import os
23import re
24
25from six import string_types
26
27from nltk.tokenize import RegexpTokenizer
28from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
29from nltk.corpus.reader.tagged import TaggedCorpusReader
30
31from nltk.corpus.reader.util import *
32from nltk.corpus.reader.api import *
33
34
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader looks for a ``psd`` and a ``pos`` subdirectory below
    the corpus root and wraps one sub-reader around each.  A document
    identifier is a file name from either subdirectory with its last
    four characters (the ``.psd`` / ``.pos`` suffix) removed; both
    subdirectories must yield the same set of identifiers.
    """

    def __init__(self, root, encoding='utf8'):
        """
        :param root: The root directory of the corpus.
        :param encoding: The encoding passed on to the base reader
            and to both sub-readers.
        :raise ValueError: If the ``psd`` and ``pos`` subdirectories
            do not contain the same set of documents.
        """
        # Initialise once with an empty file list: ``self.root`` is
        # needed below to locate the subdirectories, and the real file
        # list is only known after both sub-readers have been built.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): '.psd' is handed over positionally as the third
        # argument of the bracket-parse reader -- confirm which
        # parameter it binds to in BracketParseCorpusReader.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join('psd'), '.*', '.psd', encoding=encoding
        )
        # The third parameter of YCOETaggedCorpusReader is ``encoding``,
        # so pass the actual encoding by keyword rather than a '.pos'
        # literal that would be bound to it.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join('pos'), '.*', encoding=encoding
        )

        # Make sure we have a consistent set of items:
        documents = set(f[:-4] for f in self._psd_reader.fileids())
        if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')

        fileids = sorted(
            ['%s.psd' % doc for doc in documents]
            + ['%s.pos' % doc for doc in documents]
        )
        # Second initialisation, now with the complete file list.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, an iterable of file
            identifiers, or None for every document.
        :raise KeyError: If one of the given file identifiers is not
            part of this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, string_types):
            fileids = [fileids]
        else:
            # The ids are walked twice below; materialise them so that
            # one-shot iterables work as well.
            fileids = list(fileids)
        for f in fileids:
            if f not in self._fileids:
                # Report the offending id itself, not the whole
                # collection that was passed in.
                raise KeyError('File id %s not found' % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted(set(f[:-4] for f in fileids))

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every file.  The
            identifiers are not checked against the corpus.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, string_types):
            documents = [documents]
        return sorted(
            set(
                ['%s.pos' % doc for doc in documents]
                + ['%s.psd' % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every document.
        :param subcorpus: The extension to append, without the dot
            ('pos' or 'psd').
        :raise ValueError: If an identifier is unknown, or looks like
            a file identifier instead of a document identifier.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, string_types):
                documents = [documents]
            else:
                # Walked twice (validation, then formatting).
                documents = list(documents)
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in ('.pos', '.psd'):
                        raise ValueError(
                            'Expected a document identifier, not a file '
                            'identifier.  (Use corpus.documents() to get '
                            'a list of document identifiers.)'
                        )
                    else:
                        raise ValueError('Document identifier %s not found' % document)
        return ['%s.%s' % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``words()`` for the given documents."""
        return self._pos_reader.words(self._getfileids(documents, 'pos'))

    def sents(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``sents()`` for the given documents."""
        return self._pos_reader.sents(self._getfileids(documents, 'pos'))

    def paras(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``paras()`` for the given documents."""
        return self._pos_reader.paras(self._getfileids(documents, 'pos'))

    def tagged_words(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``tagged_words()`` for the given documents."""
        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))

    def tagged_sents(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``tagged_sents()`` for the given documents."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))

    def tagged_paras(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``tagged_paras()`` for the given documents."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))

    def parsed_sents(self, documents=None):
        """Delegate to the ``psd`` sub-reader's ``parsed_sents()`` for the given documents."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))
137
138
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Bracket parse corpus reader variant that removes (CODE ...)
    and (ID ...) nodes before the text is parsed."""

    def _parse(self, t):
        """
        Parse ``t`` after deleting its (CODE ...) and (ID ...) nodes.

        :param t: The bracketed text to parse.
        :return: None when only an empty pair of parentheses is left
            after the deletion; otherwise the result of
            ``BracketParseCorpusReader._parse``.
        """
        # Remove every "(CODE" / "(ID" span up to its first ")".
        cleaned = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        nothing_left = re.match(r'\s*\(\s*\)\s*$', cleaned) is not None
        if not nothing_left:
            return BracketParseCorpusReader._parse(self, cleaned)
        return None
148
149
class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader configured with '_' as the word/tag
    separator and a regexp-based sentence tokenizer."""

    def __init__(self, root, items, encoding='utf8'):
        """
        :param root: Passed on to ``TaggedCorpusReader`` unchanged.
        :param items: Passed on to ``TaggedCorpusReader`` unchanged.
        :param encoding: Accepted, but not forwarded to the base class.
        """
        # NOTE(review): ``encoding`` is never handed to
        # TaggedCorpusReader.__init__, so the base class default
        # applies -- confirm whether that is intended.

        # Gap pattern: whitespace right after "/.", or a run of
        # non-whitespace followed by "_CODE" / "_ID" together with
        # the whitespace around it.
        boundary_pattern = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep='_',
            sent_tokenizer=RegexpTokenizer(boundary_pattern, gaps=True),
        )
157
158
#: A mapping from each YCOE document identifier to the title of the
#: text it contains.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    # Second part of the Elucidarium; previously carried the same
    # title as 'coeluc1'.
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies',
}
262