1# CHILDES XML Corpus Reader
2
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
5#         Alexis Dimitriadis <A.Dimitriadis@uu.nl>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8
9"""
10Corpus reader for the XML version of the CHILDES corpus.
11"""
12from __future__ import print_function, division
13
14__docformat__ = 'epytext en'
15
16import re
17from collections import defaultdict
18from six import string_types
19
20from nltk.util import flatten, LazyMap, LazyConcatenation
21
22from nltk.corpus.reader.util import concat
23from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
24
25# to resolve the namespace issue
26NS = 'http://www.talkbank.org/ns/talkbank'
27
28
29class CHILDESCorpusReader(XMLCorpusReader):
30    """
31    Corpus reader for the XML version of the CHILDES corpus.
32    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
33    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
34    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
35    (``nltk_data/corpora/CHILDES/``).
36
37    For access to the file text use the usual nltk functions,
38    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
39    """
40
41    def __init__(self, root, fileids, lazy=True):
42        XMLCorpusReader.__init__(self, root, fileids)
43        self._lazy = lazy
44
45    def words(
46        self,
47        fileids=None,
48        speaker='ALL',
49        stem=False,
50        relation=False,
51        strip_space=True,
52        replace=False,
53    ):
54        """
55        :return: the given file(s) as a list of words
56        :rtype: list(str)
57
58        :param speaker: If specified, select specific speaker(s) defined
59            in the corpus. Default is 'ALL' (all participants). Common choices
60            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
61            researchers)
62        :param stem: If true, then use word stems instead of word strings.
63        :param relation: If true, then return tuples of (stem, index,
64            dependent_index)
65        :param strip_space: If true, then strip trailing spaces from word
66            tokens. Otherwise, leave the spaces on the tokens.
67        :param replace: If true, then use the replaced (intended) word instead
68            of the original word (e.g., 'wat' will be replaced with 'watch')
69        """
70        sent = None
71        pos = False
72        if not self._lazy:
73            return [
74                self._get_words(
75                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
76                )
77                for fileid in self.abspaths(fileids)
78            ]
79
80        get_words = lambda fileid: self._get_words(
81            fileid, speaker, sent, stem, relation, pos, strip_space, replace
82        )
83        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
84
85    def tagged_words(
86        self,
87        fileids=None,
88        speaker='ALL',
89        stem=False,
90        relation=False,
91        strip_space=True,
92        replace=False,
93    ):
94        """
95        :return: the given file(s) as a list of tagged
96            words and punctuation symbols, encoded as tuples
97            ``(word,tag)``.
98        :rtype: list(tuple(str,str))
99
100        :param speaker: If specified, select specific speaker(s) defined
101            in the corpus. Default is 'ALL' (all participants). Common choices
102            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
103            researchers)
104        :param stem: If true, then use word stems instead of word strings.
105        :param relation: If true, then return tuples of (stem, index,
106            dependent_index)
107        :param strip_space: If true, then strip trailing spaces from word
108            tokens. Otherwise, leave the spaces on the tokens.
109        :param replace: If true, then use the replaced (intended) word instead
110            of the original word (e.g., 'wat' will be replaced with 'watch')
111        """
112        sent = None
113        pos = True
114        if not self._lazy:
115            return [
116                self._get_words(
117                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
118                )
119                for fileid in self.abspaths(fileids)
120            ]
121
122        get_words = lambda fileid: self._get_words(
123            fileid, speaker, sent, stem, relation, pos, strip_space, replace
124        )
125        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
126
127    def sents(
128        self,
129        fileids=None,
130        speaker='ALL',
131        stem=False,
132        relation=None,
133        strip_space=True,
134        replace=False,
135    ):
136        """
137        :return: the given file(s) as a list of sentences or utterances, each
138            encoded as a list of word strings.
139        :rtype: list(list(str))
140
141        :param speaker: If specified, select specific speaker(s) defined
142            in the corpus. Default is 'ALL' (all participants). Common choices
143            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
144            researchers)
145        :param stem: If true, then use word stems instead of word strings.
146        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
147            If there is manually-annotated relation info, it will return
148            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
149        :param strip_space: If true, then strip trailing spaces from word
150            tokens. Otherwise, leave the spaces on the tokens.
151        :param replace: If true, then use the replaced (intended) word instead
152            of the original word (e.g., 'wat' will be replaced with 'watch')
153        """
154        sent = True
155        pos = False
156        if not self._lazy:
157            return [
158                self._get_words(
159                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
160                )
161                for fileid in self.abspaths(fileids)
162            ]
163
164        get_words = lambda fileid: self._get_words(
165            fileid, speaker, sent, stem, relation, pos, strip_space, replace
166        )
167        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
168
169    def tagged_sents(
170        self,
171        fileids=None,
172        speaker='ALL',
173        stem=False,
174        relation=None,
175        strip_space=True,
176        replace=False,
177    ):
178        """
179        :return: the given file(s) as a list of
180            sentences, each encoded as a list of ``(word,tag)`` tuples.
181        :rtype: list(list(tuple(str,str)))
182
183        :param speaker: If specified, select specific speaker(s) defined
184            in the corpus. Default is 'ALL' (all participants). Common choices
185            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
186            researchers)
187        :param stem: If true, then use word stems instead of word strings.
188        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
189            If there is manually-annotated relation info, it will return
190            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
191        :param strip_space: If true, then strip trailing spaces from word
192            tokens. Otherwise, leave the spaces on the tokens.
193        :param replace: If true, then use the replaced (intended) word instead
194            of the original word (e.g., 'wat' will be replaced with 'watch')
195        """
196        sent = True
197        pos = True
198        if not self._lazy:
199            return [
200                self._get_words(
201                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
202                )
203                for fileid in self.abspaths(fileids)
204            ]
205
206        get_words = lambda fileid: self._get_words(
207            fileid, speaker, sent, stem, relation, pos, strip_space, replace
208        )
209        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
210
211    def corpus(self, fileids=None):
212        """
213        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
214        :rtype: list(dict)
215        """
216        if not self._lazy:
217            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
218        return LazyMap(self._get_corpus, self.abspaths(fileids))
219
220    def _get_corpus(self, fileid):
221        results = dict()
222        xmldoc = ElementTree.parse(fileid).getroot()
223        for key, value in xmldoc.items():
224            results[key] = value
225        return results
226
227    def participants(self, fileids=None):
228        """
229        :return: the given file(s) as a dict of
230            ``(participant_property_key, value)``
231        :rtype: list(dict)
232        """
233        if not self._lazy:
234            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
235        return LazyMap(self._get_participants, self.abspaths(fileids))
236
237    def _get_participants(self, fileid):
238        # multidimensional dicts
239        def dictOfDicts():
240            return defaultdict(dictOfDicts)
241
242        xmldoc = ElementTree.parse(fileid).getroot()
243        # getting participants' data
244        pat = dictOfDicts()
245        for participant in xmldoc.findall(
246            './/{%s}Participants/{%s}participant' % (NS, NS)
247        ):
248            for (key, value) in participant.items():
249                pat[participant.get('id')][key] = value
250        return pat
251
252    def age(self, fileids=None, speaker='CHI', month=False):
253        """
254        :return: the given file(s) as string or int
255        :rtype: list or int
256
257        :param month: If true, return months instead of year-month-date
258        """
259        if not self._lazy:
260            return [
261                self._get_age(fileid, speaker, month)
262                for fileid in self.abspaths(fileids)
263            ]
264        get_age = lambda fileid: self._get_age(fileid, speaker, month)
265        return LazyMap(get_age, self.abspaths(fileids))
266
267    def _get_age(self, fileid, speaker, month):
268        xmldoc = ElementTree.parse(fileid).getroot()
269        for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
270            try:
271                if pat.get('id') == speaker:
272                    age = pat.get('age')
273                    if month:
274                        age = self.convert_age(age)
275                    return age
276            # some files don't have age data
277            except (TypeError, AttributeError) as e:
278                return None
279
280    def convert_age(self, age_year):
281        "Caclculate age in months from a string in CHILDES format"
282        m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
283        age_month = int(m.group(1)) * 12 + int(m.group(2))
284        try:
285            if int(m.group(3)) > 15:
286                age_month += 1
287        # some corpora don't have age information?
288        except ValueError as e:
289            pass
290        return age_month
291
292    def MLU(self, fileids=None, speaker='CHI'):
293        """
294        :return: the given file(s) as a floating number
295        :rtype: list(float)
296        """
297        if not self._lazy:
298            return [
299                self._getMLU(fileid, speaker=speaker)
300                for fileid in self.abspaths(fileids)
301            ]
302        get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
303        return LazyMap(get_MLU, self.abspaths(fileids))
304
    def _getMLU(self, fileid, speaker):
        """Compute the mean length of utterance (MLU), in morphemes, for one file.

        Utterances containing unintelligible material (POS 'unk'), empty
        utterances, and exact repeats of the previous utterance are skipped.
        Filler tokens (POS 'co' or no POS) are subtracted from the morpheme
        count, and each sentence containing fillers is discounted once from
        the sentence count.

        :param fileid: path of the CHILDES XML file
        :param speaker: participant code whose utterances are measured
        :rtype: float (0 if no countable sentences remain)
        """
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # skip if any part of the sentence is unintelligible (POS 'unk')
            if any(pos == 'unk' for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers ('co' communicators and untagged
                # tokens); discount each sentence that contains any
                if len(set(['co', None]).intersection(posList)) > 0:
                    numFillers += posList.count('co')
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split('-') for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            # no countable sentences after filtering/discounting
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
352
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Parse one CHILDES XML file into words or sentences.

        Shared backend for ``words()``, ``sents()``, ``tagged_words()``,
        ``tagged_sents()`` and ``_getMLU()``; the boolean switches select
        the output shape.

        :param fileid: path of the XML file to parse
        :param speaker: 'ALL', or a participant code / list of codes to keep
        :param sent: if true, group words into one list per utterance
        :param stem: if true, use stems (with '-' inflection and '~' clitic
            markers appended where present)
        :param relation: if true, attach dependency-relation info and group
            output per utterance
        :param pos: if true, produce (word, tag) tuples
        :param strip_space: if true, strip surrounding whitespace from words
        :param replace: if true, prefer the transcriber's replacement word
        """
        if (
            isinstance(speaker, string_types) and speaker != 'ALL'
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
            sents = []
            # select speakers
            if speaker == 'ALL' or xmlsent.get('who') in speaker:
                for xmlword in xmlsent.findall('.//{%s}w' % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): find() results are truth-tested here; an
                    # Element with no children is falsy, so replacements may
                    # be skipped, and only the utterance's *first* replacement
                    # is ever used — confirm intended.
                    if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                        xmlword = xmlsent.find(
                            './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
                        )
                    elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                        xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ''
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        # fall back to the surface form when no <stem> exists
                        try:
                            xmlstem = xmlword.find('.//{%s}stem' % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            pass
                        # if there is an inflection
                        # NOTE(review): bare except also hides TypeError when
                        # <mk> has no text — confirm acceptable.
                        try:
                            xmlinfl = xmlword.find(
                                './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
                            )
                            word += '-' + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix (clitic in <mor-post>)
                        try:
                            xmlsuffix = xmlword.find(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        try:
                            # main category <c>, optional subcategory <s>
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            # POS of the clitic suffix, if any
                            xmlsuffixpos = xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            './/{%s}mor/{%s}gra' % (NS, NS)
                        ):
                            # automatic (test) relation: extend (word, tag)
                            # to (word, tag, "index|head|relation")
                            if not xmlstem_rel.get('type') == 'grt':
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get('index')
                                    + "|"
                                    + xmlstem_rel.get('head')
                                    + "|"
                                    + xmlstem_rel.get('relation'),
                                )
                            else:
                                # gold relation: append a second
                                # (word, tag, relation) triple
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get('index')
                                    + "|"
                                    + xmlstem_rel.get('head')
                                    + "|"
                                    + xmlstem_rel.get('relation'),
                                )
                        # same treatment for the clitic suffix's relations
                        try:
                            for xmlpost_rel in xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
                            ):
                                if not xmlpost_rel.get('type') == 'grt':
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get('index')
                                        + "|"
                                        + xmlpost_rel.get('head')
                                        + "|"
                                        + xmlpost_rel.get('relation'),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get('index')
                                        + "|"
                                        + xmlpost_rel.get('head')
                                        + "|"
                                        + xmlpost_rel.get('relation'),
                                    )
                        except:
                            pass
                    sents.append(word)
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)
508
    # Ready-to-use browser opener

    # NOTE: the triple-quoted string below is a bare expression statement in
    # the class body (not attached to the attribute as a docstring); it serves
    # only as in-source commentary for ``childes_url_base``.
    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    childes_url_base = r'https://childes.talkbank.org/browser/index.php?url='
517
518    def webview_file(self, fileid, urlbase=None):
519        """Map a corpus file to its web version on the CHILDES website,
520        and open it in a web browser.
521
522        The complete URL to be used is:
523            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
524
525        If no urlbase is passed, we try to calculate it.  This
526        requires that the childes corpus was set up to mirror the
527        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
528        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
529        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
530
531        The function first looks (as a special case) if "Eng-USA" is
532        on the path consisting of <corpus root>+fileid; then if
533        "childes", possibly followed by "data-xml", appears. If neither
534        one is found, we use the unmodified fileid and hope for the best.
535        If this is not right, specify urlbase explicitly, e.g., if the
536        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
537        """
538
539        import webbrowser
540
541        if urlbase:
542            path = urlbase + "/" + fileid
543        else:
544            full = self.root + "/" + fileid
545            full = re.sub(r'\\', '/', full)
546            if '/childes/' in full.lower():
547                # Discard /data-xml/ if present
548                path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
549            elif 'eng-usa' in full.lower():
550                path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0]
551            else:
552                path = fileid
553
554        # Strip ".xml" and add ".cha", as necessary:
555        if path.endswith('.xml'):
556            path = path[:-4]
557
558        if not path.endswith('.cha'):
559            path = path + '.cha'
560
561        url = self.childes_url_base + path
562
563        webbrowser.open_new_tab(url)
564        print("Opening in browser:", url)
565        # Pausing is a good idea, but it's up to the user...
566        # raw_input("Hit Return to continue")
567
568
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        # ('fileid' rather than 'file' to avoid shadowing the builtin)
        for fileid in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(fileid)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for (participant, values) in childes.participants(fileid)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()

    except LookupError as e:
        # Fixed mismatched quotes in the example call in the message below.
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
630
631
# Run the demo (against a locally installed CHILDES corpus) when executed
# as a script.
if __name__ == "__main__":
    demo()
634