# CHILDES XML Corpus Reader

# Copyright (C) 2001-2019 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
#         Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the XML version of the CHILDES corpus.
"""
from __future__ import print_function, division

__docformat__ = 'epytext en'

import re
from collections import defaultdict
from six import string_types

from nltk.util import flatten, LazyMap, LazyConcatenation

from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree

# to resolve the namespace issue
NS = 'http://www.talkbank.org/ns/talkbank'


class CHILDESCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the CHILDES corpus.
    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(
        self,
        fileids=None,
        speaker='ALL',
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None
        pos = False
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
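
    # Illustrative usage of words() above (a minimal sketch; the corpus name,
    # fileid, and data path are examples only and assume that part of the
    # CHILDES XML data has been downloaded to nltk_data/corpora/childes/):
    #
    #     >>> import nltk
    #     >>> from nltk.corpus.reader import CHILDESCorpusReader
    #     >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
    #     >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
    #     >>> valian.words('Valian/01a.xml')[:7]                  # all speakers
    #     >>> valian.words('Valian/01a.xml', speaker='CHI')[:7]   # child only
    #     >>> valian.words('Valian/01a.xml', stem=True)[:7]       # word stems
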

    def tagged_words(
        self,
        fileids=None,
        speaker='ALL',
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None
        pos = True
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def sents(
        self,
        fileids=None,
        speaker='ALL',
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of word strings.
        :rtype: list(list(str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = True
        pos = False
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def tagged_sents(
        self,
        fileids=None,
        speaker='ALL',
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = True
        pos = True
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
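
    # Return shapes of the sentence-level accessors above (a sketch; the token
    # values and tags shown are illustrative, not taken from a real file):
    #
    #     sents(fileid)        -> [['where', 'go-PAST', ...], ...]   one list per utterance
    #     tagged_sents(fileid) -> [[('where', 'adv:wh'), ...], ...]  (word, tag) pairs
    #
    # With relation=True each token grows into (stem, pos, 'index|head|relation'),
    # built from the %gra dependency tier, and into a 6-tuple when a gold
    # (manually annotated) relation tier is also present, as described above.
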

    def corpus(self, fileids=None):
        """
        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_corpus, self.abspaths(fileids))

    def _get_corpus(self, fileid):
        results = dict()
        xmldoc = ElementTree.parse(fileid).getroot()
        for key, value in xmldoc.items():
            results[key] = value
        return results

    def participants(self, fileids=None):
        """
        :return: the given file(s) as a dict of
            ``(participant_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_participants, self.abspaths(fileids))

    def _get_participants(self, fileid):
        # multidimensional dicts
        def dictOfDicts():
            return defaultdict(dictOfDicts)

        xmldoc = ElementTree.parse(fileid).getroot()
        # getting participants' data
        pat = dictOfDicts()
        for participant in xmldoc.findall(
            './/{%s}Participants/{%s}participant' % (NS, NS)
        ):
            for (key, value) in participant.items():
                pat[participant.get('id')][key] = value
        return pat

    def age(self, fileids=None, speaker='CHI', month=False):
        """
        :return: the given file(s) as a list of ages, each as a string in
            CHILDES format (e.g., 'P2Y6M14D') or as an int (months) if
            ``month`` is true
        :rtype: list(str) or list(int)

        :param month: If true, return months instead of year-month-date
        """
        if not self._lazy:
            return [
                self._get_age(fileid, speaker, month)
                for fileid in self.abspaths(fileids)
            ]
        get_age = lambda fileid: self._get_age(fileid, speaker, month)
        return LazyMap(get_age, self.abspaths(fileids))

    def _get_age(self, fileid, speaker, month):
        xmldoc = ElementTree.parse(fileid).getroot()
        for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
            try:
                if pat.get('id') == speaker:
                    age = pat.get('age')
                    if month:
                        age = self.convert_age(age)
                    return age
            # some files don't have age data
            except (TypeError, AttributeError) as e:
                return None

    def convert_age(self, age_year):
        "Calculate age in months from a string in CHILDES format"
        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
        age_month = int(m.group(1)) * 12 + int(m.group(2))
        try:
            if int(m.group(3)) > 15:
                age_month += 1
        # some corpora don't have age information
        except ValueError as e:
            pass
        return age_month
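
    # Worked example for convert_age() above: the CHILDES age string 'P2Y6M14D'
    # means 2 years, 6 months, 14 days, so the result is 2 * 12 + 6 = 30 months;
    # because the day count (14) is not greater than 15, no extra month is added,
    # while 'P2Y6M20D' would round up to 31.
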

    def MLU(self, fileids=None, speaker='CHI'):
        """
        :return: the mean length of utterance (MLU) of the given speaker
            in each file, as a float
        :rtype: list(float)
        """
        if not self._lazy:
            return [
                self._getMLU(fileid, speaker=speaker)
                for fileid in self.abspaths(fileids)
            ]
        get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
        return LazyMap(get_MLU, self.abspaths(fileids))

    def _getMLU(self, fileid, speaker):
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # if any part of the sentence is unintelligible
            if any(pos == 'unk' for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers
                if len(set(['co', None]).intersection(posList)) > 0:
                    numFillers += posList.count('co')
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split('-') for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
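
    # Worked example for the MLU computation above (numbers are illustrative):
    # with stem=True a token such as 'read-PAST' splits on '-' into 2 morphemes,
    # so an utterance ['I', 'read-PAST', 'it'] contributes 4 morphemes.
    # Utterances containing an 'unk' tag, empty utterances, and exact repeats of
    # the previous utterance are skipped; filler tokens (tag 'co' or no tag) are
    # subtracted from the morpheme count, and each utterance containing them is
    # discounted from the utterance count.  E.g. 40 morphemes over 10 counted
    # utterances gives an MLU of 4.0.
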

    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        if (
            isinstance(speaker, string_types) and speaker != 'ALL'
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
            sents = []
            # select speakers
            if speaker == 'ALL' or xmlsent.get('who') in speaker:
                for xmlword in xmlsent.findall('.//{%s}w' % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                        xmlword = xmlsent.find(
                            './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
                        )
                    elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                        xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ''
                    # strip trailing space
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find('.//{%s}stem' % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            pass
                        # if there is an inflection
                        try:
                            xmlinfl = xmlword.find(
                                './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
                            )
                            word += '-' + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            './/{%s}mor/{%s}gra' % (NS, NS)
                        ):
                            if not xmlstem_rel.get('type') == 'grt':
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get('index')
                                    + "|"
                                    + xmlstem_rel.get('head')
                                    + "|"
                                    + xmlstem_rel.get('relation'),
                                )
                            else:
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get('index')
                                    + "|"
                                    + xmlstem_rel.get('head')
                                    + "|"
                                    + xmlstem_rel.get('relation'),
                                )
                        try:
                            for xmlpost_rel in xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
                            ):
                                if not xmlpost_rel.get('type') == 'grt':
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get('index')
                                        + "|"
                                        + xmlpost_rel.get('head')
                                        + "|"
                                        + xmlpost_rel.get('relation'),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get('index')
                                        + "|"
                                        + xmlpost_rel.get('head')
                                        + "|"
                                        + xmlpost_rel.get('relation'),
                                    )
                        except:
                            pass
                    sents.append(word)
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)
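
    # Token notation produced by _get_words() above (the forms are illustrative):
    # with stem=True an inflection from <mk> is attached with '-' (e.g. 'go-PAST'),
    # and a clitic from the <mor-post> tier is attached with '~' (e.g. 'that~be'
    # for "that's"); the corresponding tags are joined the same way
    # (e.g. 'pro:dem~cop').  When sent (or relation) is set, each utterance stays
    # a separate list; otherwise the utterances are flattened into one token stream.
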

    # Ready-to-use browser opener

    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    childes_url_base = r'https://childes.talkbank.org/browser/index.php?url='

    def webview_file(self, fileid, urlbase=None):
        """Map a corpus file to its web version on the CHILDES website,
        and open it in a web browser.

        The complete URL to be used is:
            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

        If no urlbase is passed, we try to calculate it.  This
        requires that the childes corpus was set up to mirror the
        folder hierarchy under childes.talkbank.org/data-xml/, e.g.:
            nltk_data/corpora/childes/Eng-USA/Cornell/??? or
            nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

        The function first looks (as a special case) if "Eng-USA" is
        on the path consisting of <corpus root>+fileid; then if
        "childes", possibly followed by "data-xml", appears. If neither
        one is found, we use the unmodified fileid and hope for the best.
        If this is not right, specify urlbase explicitly, e.g., if the
        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
        """

        import webbrowser

        if urlbase:
            path = urlbase + "/" + fileid
        else:
            full = self.root + "/" + fileid
            full = re.sub(r'\\', '/', full)
            if '/childes/' in full.lower():
                # Discard /data-xml/ if present
                path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
            elif 'eng-usa' in full.lower():
                path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0]
            else:
                path = fileid

        # Strip ".xml" and add ".cha", as necessary:
        if path.endswith('.xml'):
            path = path[:-4]

        if not path.endswith('.cha'):
            path = path + '.cha'

        url = self.childes_url_base + path

        webbrowser.open_new_tab(url)
        print("Opening in browser:", url)
        # Pausing is a good idea, but it's up to the user...
        # raw_input("Hit Return to continue")


def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe each corpus file
        for file in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
        Alternatively, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        # this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())


if __name__ == "__main__":
    demo()