1# CHILDES XML Corpus Reader
2
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
5#         Alexis Dimitriadis <A.Dimitriadis@uu.nl>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8
9"""
10Corpus reader for the XML version of the CHILDES corpus.
11"""
12from __future__ import print_function, division
13
14__docformat__ = 'epytext en'
15
16import re
17from collections import defaultdict
18from six import string_types
19
20from nltk.util import flatten, LazyMap, LazyConcatenation
21
22from nltk.corpus.reader.util import concat
23from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
24
25# to resolve the namespace issue
26NS = 'http://www.talkbank.org/ns/talkbank'
27
28
29class CHILDESCorpusReader(XMLCorpusReader):
30    """
31    Corpus reader for the XML version of the CHILDES corpus.
32    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
33    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
34    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
35    (``nltk_data/corpora/CHILDES/``).
36
37    For access to the file text use the usual nltk functions,
38    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
39    """
40
41    def __init__(self, root, fileids, lazy=True):
42        XMLCorpusReader.__init__(self, root, fileids)
43        self._lazy = lazy
44
45    def words(
46        self,
47        fileids=None,
48        speaker='ALL',
49        stem=False,
50        relation=False,
51        strip_space=True,
52        replace=False,
53    ):
54        """
55        :return: the given file(s) as a list of words
56        :rtype: list(str)
57
58        :param speaker: If specified, select specific speaker(s) defined
59            in the corpus. Default is 'ALL' (all participants). Common choices
60            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
61            researchers)
62        :param stem: If true, then use word stems instead of word strings.
63        :param relation: If true, then return tuples of (stem, index,
64            dependent_index)
65        :param strip_space: If true, then strip trailing spaces from word
66            tokens. Otherwise, leave the spaces on the tokens.
67        :param replace: If true, then use the replaced (intended) word instead
68            of the original word (e.g., 'wat' will be replaced with 'watch')
69        """
70        sent = None
71        pos = False
72        if not self._lazy:
73            return [
74                self._get_words(
75                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
76                )
77                for fileid in self.abspaths(fileids)
78            ]
79
80        get_words = lambda fileid: self._get_words(
81            fileid, speaker, sent, stem, relation, pos, strip_space, replace
82        )
83        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
84
85    def tagged_words(
86        self,
87        fileids=None,
88        speaker='ALL',
89        stem=False,
90        relation=False,
91        strip_space=True,
92        replace=False,
93    ):
94        """
95        :return: the given file(s) as a list of tagged
96            words and punctuation symbols, encoded as tuples
97            ``(word,tag)``.
98        :rtype: list(tuple(str,str))
99
100        :param speaker: If specified, select specific speaker(s) defined
101            in the corpus. Default is 'ALL' (all participants). Common choices
102            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
103            researchers)
104        :param stem: If true, then use word stems instead of word strings.
105        :param relation: If true, then return tuples of (stem, index,
106            dependent_index)
107        :param strip_space: If true, then strip trailing spaces from word
108            tokens. Otherwise, leave the spaces on the tokens.
109        :param replace: If true, then use the replaced (intended) word instead
110            of the original word (e.g., 'wat' will be replaced with 'watch')
111        """
112        sent = None
113        pos = True
114        if not self._lazy:
115            return [
116                self._get_words(
117                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
118                )
119                for fileid in self.abspaths(fileids)
120            ]
121
122        get_words = lambda fileid: self._get_words(
123            fileid, speaker, sent, stem, relation, pos, strip_space, replace
124        )
125        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
126
127    def sents(
128        self,
129        fileids=None,
130        speaker='ALL',
131        stem=False,
132        relation=None,
133        strip_space=True,
134        replace=False,
135    ):
136        """
137        :return: the given file(s) as a list of sentences or utterances, each
138            encoded as a list of word strings.
139        :rtype: list(list(str))
140
141        :param speaker: If specified, select specific speaker(s) defined
142            in the corpus. Default is 'ALL' (all participants). Common choices
143            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
144            researchers)
145        :param stem: If true, then use word stems instead of word strings.
146        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
147            If there is manually-annotated relation info, it will return
148            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
149        :param strip_space: If true, then strip trailing spaces from word
150            tokens. Otherwise, leave the spaces on the tokens.
151        :param replace: If true, then use the replaced (intended) word instead
152            of the original word (e.g., 'wat' will be replaced with 'watch')
153        """
154        sent = True
155        pos = False
156        if not self._lazy:
157            return [
158                self._get_words(
159                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
160                )
161                for fileid in self.abspaths(fileids)
162            ]
163
164        get_words = lambda fileid: self._get_words(
165            fileid, speaker, sent, stem, relation, pos, strip_space, replace
166        )
167        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
168
169    def tagged_sents(
170        self,
171        fileids=None,
172        speaker='ALL',
173        stem=False,
174        relation=None,
175        strip_space=True,
176        replace=False,
177    ):
178        """
179        :return: the given file(s) as a list of
180            sentences, each encoded as a list of ``(word,tag)`` tuples.
181        :rtype: list(list(tuple(str,str)))
182
183        :param speaker: If specified, select specific speaker(s) defined
184            in the corpus. Default is 'ALL' (all participants). Common choices
185            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
186            researchers)
187        :param stem: If true, then use word stems instead of word strings.
188        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
189            If there is manually-annotated relation info, it will return
190            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
191        :param strip_space: If true, then strip trailing spaces from word
192            tokens. Otherwise, leave the spaces on the tokens.
193        :param replace: If true, then use the replaced (intended) word instead
194            of the original word (e.g., 'wat' will be replaced with 'watch')
195        """
196        sent = True
197        pos = True
198        if not self._lazy:
199            return [
200                self._get_words(
201                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
202                )
203                for fileid in self.abspaths(fileids)
204            ]
205
206        get_words = lambda fileid: self._get_words(
207            fileid, speaker, sent, stem, relation, pos, strip_space, replace
208        )
209        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
210
211    def corpus(self, fileids=None):
212        """
213        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
214        :rtype: list(dict)
215        """
216        if not self._lazy:
217            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
218        return LazyMap(self._get_corpus, self.abspaths(fileids))
219
220    def _get_corpus(self, fileid):
221        results = dict()
222        xmldoc = ElementTree.parse(fileid).getroot()
223        for key, value in xmldoc.items():
224            results[key] = value
225        return results
226
227    def participants(self, fileids=None):
228        """
229        :return: the given file(s) as a dict of
230            ``(participant_property_key, value)``
231        :rtype: list(dict)
232        """
233        if not self._lazy:
234            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
235        return LazyMap(self._get_participants, self.abspaths(fileids))
236
237    def _get_participants(self, fileid):
238        # multidimensional dicts
239        def dictOfDicts():
240            return defaultdict(dictOfDicts)
241
242        xmldoc = ElementTree.parse(fileid).getroot()
243        # getting participants' data
244        pat = dictOfDicts()
245        for participant in xmldoc.findall(
246            './/{%s}Participants/{%s}participant' % (NS, NS)
247        ):
248            for (key, value) in participant.items():
249                pat[participant.get('id')][key] = value
250        return pat
251
252    def age(self, fileids=None, speaker='CHI', month=False):
253        """
254        :return: the given file(s) as string or int
255        :rtype: list or int
256
257        :param month: If true, return months instead of year-month-date
258        """
259        if not self._lazy:
260            return [
261                self._get_age(fileid, speaker, month)
262                for fileid in self.abspaths(fileids)
263            ]
264        get_age = lambda fileid: self._get_age(fileid, speaker, month)
265        return LazyMap(get_age, self.abspaths(fileids))
266
267    def _get_age(self, fileid, speaker, month):
268        xmldoc = ElementTree.parse(fileid).getroot()
269        for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
270            try:
271                if pat.get('id') == speaker:
272                    age = pat.get('age')
273                    if month:
274                        age = self.convert_age(age)
275                    return age
276            # some files don't have age data
277            except (TypeError, AttributeError) as e:
278                return None
279
280    def convert_age(self, age_year):
281        "Caclculate age in months from a string in CHILDES format"
282        m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
283        age_month = int(m.group(1)) * 12 + int(m.group(2))
284        try:
285            if int(m.group(3)) > 15:
286                age_month += 1
287        # some corpora don't have age information?
288        except ValueError as e:
289            pass
290        return age_month
291
292    def MLU(self, fileids=None, speaker='CHI'):
293        """
294        :return: the given file(s) as a floating number
295        :rtype: list(float)
296        """
297        if not self._lazy:
298            return [
299                self._getMLU(fileid, speaker=speaker)
300                for fileid in self.abspaths(fileids)
301            ]
302        get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
303        return LazyMap(get_MLU, self.abspaths(fileids))
304
    def _getMLU(self, fileid, speaker):
        """Compute the mean length of utterance (MLU), in morphemes, for one file.

        Utterances containing unintelligible material (POS 'unk'), empty
        utterances, and exact repeats of the previous utterance are skipped.
        Filler tokens (POS 'co' or no POS) are subtracted from the morpheme
        count, and each sentence containing fillers is discounted once from
        the sentence count.

        :param fileid: path of the CHILDES XML file
        :param speaker: participant code whose utterances are measured
        :rtype: float (0 if no countable sentences remain)
        """
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # skip if any part of the sentence is unintelligible (POS 'unk')
            if any(pos == 'unk' for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers ('co' communicators and untagged
                # tokens); discount each sentence that contains any
                if len(set(['co', None]).intersection(posList)) > 0:
                    numFillers += posList.count('co')
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split('-') for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            # no countable sentences after filtering/discounting
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
352
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Parse one CHILDES XML file into words or sentences.

        Shared backend for ``words()``, ``sents()``, ``tagged_words()``,
        ``tagged_sents()`` and ``_getMLU()``; the boolean switches select
        the output shape.

        :param fileid: path of the XML file to parse
        :param speaker: 'ALL', or a participant code / list of codes to keep
        :param sent: if true, group words into one list per utterance
        :param stem: if true, use stems (with '-' inflection and '~' clitic
            markers appended where present)
        :param relation: if true, attach dependency-relation info and group
            output per utterance
        :param pos: if true, produce (word, tag) tuples
        :param strip_space: if true, strip surrounding whitespace from words
        :param replace: if true, prefer the transcriber's replacement word
        """
        if (
            isinstance(speaker, string_types) and speaker != 'ALL'
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
            sents = []
            # select speakers
            if speaker == 'ALL' or xmlsent.get('who') in speaker:
                for xmlword in xmlsent.findall('.//{%s}w' % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): find() results are truth-tested here; an
                    # Element with no children is falsy, so replacements may
                    # be skipped, and only the utterance's *first* replacement
                    # is ever used — confirm intended.
                    if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                        xmlword = xmlsent.find(
                            './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
                        )
                    elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                        xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ''
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        # fall back to the surface form when no <stem> exists
                        try:
                            xmlstem = xmlword.find('.//{%s}stem' % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            pass
                        # if there is an inflection
                        # NOTE(review): bare except also hides TypeError when
                        # <mk> has no text — confirm acceptable.
                        try:
                            xmlinfl = xmlword.find(
                                './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
                            )
                            word += '-' + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix (clitic in <mor-post>)
                        try:
                            xmlsuffix = xmlword.find(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        try:
                            # main category <c>, optional subcategory <s>
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            # POS of the clitic suffix, if any
                            xmlsuffixpos = xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            './/{%s}mor/{%s}gra' % (NS, NS)
                        ):
                            # automatic (test) relation: extend (word, tag)
                            # to (word, tag, "index|head|relation")
                            if not xmlstem_rel.get('type') == 'grt':
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get('index')
                                    + "|"
                                    + xmlstem_rel.get('head')
                                    + "|"
                                    + xmlstem_rel.get('relation'),
                                )
                            else:
                                # gold relation: append a second
                                # (word, tag, relation) triple
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get('index')
                                    + "|"
                                    + xmlstem_rel.get('head')
                                    + "|"
                                    + xmlstem_rel.get('relation'),
                                )
                        # same treatment for the clitic suffix's relations
                        try:
                            for xmlpost_rel in xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
                            ):
                                if not xmlpost_rel.get('type') == 'grt':
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get('index')
                                        + "|"
                                        + xmlpost_rel.get('head')
                                        + "|"
                                        + xmlpost_rel.get('relation'),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get('index')
                                        + "|"
                                        + xmlpost_rel.get('head')
                                        + "|"
                                        + xmlpost_rel.get('relation'),
                                    )
                        except:
                            pass
                    sents.append(word)
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)
508
    # Ready-to-use browser opener

    # NOTE: the triple-quoted string below is a bare expression statement in
    # the class body (not attached to the attribute as a docstring); it serves
    # only as in-source commentary for ``childes_url_base``.
    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    childes_url_base = r'https://childes.talkbank.org/browser/index.php?url='
517
518    def webview_file(self, fileid, urlbase=None):
519        """Map a corpus file to its web version on the CHILDES website,
520        and open it in a web browser.
521
522        The complete URL to be used is:
523            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
524
525        If no urlbase is passed, we try to calculate it.  This
526        requires that the childes corpus was set up to mirror the
527        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
528        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
529        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
530
531        The function first looks (as a special case) if "Eng-USA" is
532        on the path consisting of <corpus root>+fileid; then if
533        "childes", possibly followed by "data-xml", appears. If neither
534        one is found, we use the unmodified fileid and hope for the best.
535        If this is not right, specify urlbase explicitly, e.g., if the
536        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
537        """
538
539        import webbrowser
540
541        if urlbase:
542            path = urlbase + "/" + fileid
543        else:
544            full = self.root + "/" + fileid
545            full = re.sub(r'\\', '/', full)
546            if '/childes/' in full.lower():
547                # Discard /data-xml/ if present
548                path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
549            elif 'eng-usa' in full.lower():
550                path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0]
551            else:
552                path = fileid
553
554        # Strip ".xml" and add ".cha", as necessary:
555        if path.endswith('.xml'):
556            path = path[:-4]
557
558        if not path.endswith('.cha'):
559            path = path + '.cha'
560
561        url = self.childes_url_base + path
562
563        webbrowser.open_new_tab(url)
564        print("Opening in browser:", url)
565        # Pausing is a good idea, but it's up to the user...
566        # raw_input("Hit Return to continue")
567
568
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        # ('fileid' rather than 'file' to avoid shadowing the builtin)
        for fileid in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(fileid)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for (participant, values) in childes.participants(fileid)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()

    except LookupError as e:
        # Fixed mismatched quotes in the example call in the message below.
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
630
631
# Run the demo (against a locally installed CHILDES corpus) when executed
# as a script.
if __name__ == "__main__":
    demo()
634