1# Natural Language Toolkit: TIMIT Corpus Reader 2# 3# Copyright (C) 2001-2007 NLTK Project 4# Author: Haejoong Lee <haejoong@ldc.upenn.edu> 5# Steven Bird <stevenbird1@gmail.com> 6# Jacob Perkins <japerk@gmail.com> 7# URL: <http://nltk.org/> 8# For license information, see LICENSE.TXT 9 10# [xx] this docstring is out-of-date: 11""" 12Read tokens, phonemes and audio data from the NLTK TIMIT Corpus. 13 14This corpus contains selected portion of the TIMIT corpus. 15 16 - 16 speakers from 8 dialect regions 17 - 1 male and 1 female from each dialect region 18 - total 130 sentences (10 sentences per speaker. Note that some 19 sentences are shared among other speakers, especially sa1 and sa2 20 are spoken by all speakers.) 21 - total 160 recording of sentences (10 recordings per speaker) 22 - audio format: NIST Sphere, single channel, 16kHz sampling, 23 16 bit sample, PCM encoding 24 25 26Module contents 27=============== 28 29The timit corpus reader provides 4 functions and 4 data items. 30 31 - utterances 32 33 List of utterances in the corpus. There are total 160 utterances, 34 each of which corresponds to a unique utterance of a speaker. 35 Here's an example of an utterance identifier in the list:: 36 37 dr1-fvmh0/sx206 38 - _---- _--- 39 | | | | | 40 | | | | | 41 | | | | `--- sentence number 42 | | | `----- sentence type (a:all, i:shared, x:exclusive) 43 | | `--------- speaker ID 44 | `------------ sex (m:male, f:female) 45 `-------------- dialect region (1..8) 46 47 - speakers 48 49 List of speaker IDs. An example of speaker ID:: 50 51 dr1-fvmh0 52 53 Note that if you split an item ID with colon and take the first element of 54 the result, you will get a speaker ID. 55 56 >>> itemid = 'dr1-fvmh0/sx206' 57 >>> spkrid , sentid = itemid.split('/') 58 >>> spkrid 59 'dr1-fvmh0' 60 61 The second element of the result is a sentence ID. 62 63 - dictionary() 64 65 Phonetic dictionary of words contained in this corpus. This is a Python 66 dictionary from words to phoneme lists. 67 68 - spkrinfo() 69 70 Speaker information table. It's a Python dictionary from speaker IDs to 71 records of 10 fields. Speaker IDs the same as the ones in timie.speakers. 72 Each record is a dictionary from field names to values, and the fields are 73 as follows:: 74 75 id speaker ID as defined in the original TIMIT speaker info table 76 sex speaker gender (M:male, F:female) 77 dr speaker dialect region (1:new england, 2:northern, 78 3:north midland, 4:south midland, 5:southern, 6:new york city, 79 7:western, 8:army brat (moved around)) 80 use corpus type (TRN:training, TST:test) 81 in this sample corpus only TRN is available 82 recdate recording date 83 birthdate speaker birth date 84 ht speaker height 85 race speaker race (WHT:white, BLK:black, AMR:american indian, 86 SPN:spanish-american, ORN:oriental,???:unknown) 87 edu speaker education level (HS:high school, AS:associate degree, 88 BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA), 89 PHD:doctorate degree (PhD,JD,MD), ??:unknown) 90 comments comments by the recorder 91 92The 4 functions are as follows. 93 94 - tokenized(sentences=items, offset=False) 95 96 Given a list of items, returns an iterator of a list of word lists, 97 each of which corresponds to an item (sentence). If offset is set to True, 98 each element of the word list is a tuple of word(string), start offset and 99 end offset, where offset is represented as a number of 16kHz samples. 100 101 - phonetic(sentences=items, offset=False) 102 103 Given a list of items, returns an iterator of a list of phoneme lists, 104 each of which corresponds to an item (sentence). If offset is set to True, 105 each element of the phoneme list is a tuple of word(string), start offset 106 and end offset, where offset is represented as a number of 16kHz samples. 107 108 - audiodata(item, start=0, end=None) 109 110 Given an item, returns a chunk of audio samples formatted into a string. 111 When the fuction is called, if start and end are omitted, the entire 112 samples of the recording will be returned. If only end is omitted, 113 samples from the start offset to the end of the recording will be returned. 114 115 - play(data) 116 117 Play the given audio samples. The audio samples can be obtained from the 118 timit.audiodata function. 119 120""" 121from __future__ import print_function, unicode_literals 122 123import sys 124import os 125import re 126import tempfile 127import time 128 129from six import string_types 130 131from nltk import compat 132from nltk.tree import Tree 133from nltk.internals import import_from_stdlib 134 135from nltk.corpus.reader.util import * 136from nltk.corpus.reader.api import * 137 138 139class TimitCorpusReader(CorpusReader): 140 """ 141 Reader for the TIMIT corpus (or any other corpus with the same 142 file layout and use of file formats). The corpus root directory 143 should contain the following files: 144 145 - timitdic.txt: dictionary of standard transcriptions 146 - spkrinfo.txt: table of speaker information 147 148 In addition, the root directory should contain one subdirectory 149 for each speaker, containing three files for each utterance: 150 151 - <utterance-id>.txt: text content of utterances 152 - <utterance-id>.wrd: tokenized text content of utterances 153 - <utterance-id>.phn: phonetic transcription of utterances 154 - <utterance-id>.wav: utterance sound file 155 """ 156 157 _FILE_RE = r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt' 158 """A regexp matching fileids that are used by this corpus reader.""" 159 _UTTERANCE_RE = r'\w+-\w+/\w+\.txt' 160 161 def __init__(self, root, encoding='utf8'): 162 """ 163 Construct a new TIMIT corpus reader in the given directory. 164 :param root: The root directory for this corpus. 165 """ 166 # Ensure that wave files don't get treated as unicode data: 167 if isinstance(encoding, string_types): 168 encoding = [('.*\.wav', None), ('.*', encoding)] 169 170 CorpusReader.__init__( 171 self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding 172 ) 173 174 self._utterances = [ 175 name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE) 176 ] 177 """A list of the utterance identifiers for all utterances in 178 this corpus.""" 179 180 self._speakerinfo = None 181 self._root = root 182 self.speakers = sorted(set(u.split('/')[0] for u in self._utterances)) 183 184 def fileids(self, filetype=None): 185 """ 186 Return a list of file identifiers for the files that make up 187 this corpus. 188 189 :param filetype: If specified, then ``filetype`` indicates that 190 only the files that have the given type should be 191 returned. Accepted values are: ``txt``, ``wrd``, ``phn``, 192 ``wav``, or ``metadata``, 193 """ 194 if filetype is None: 195 return CorpusReader.fileids(self) 196 elif filetype in ('txt', 'wrd', 'phn', 'wav'): 197 return ['%s.%s' % (u, filetype) for u in self._utterances] 198 elif filetype == 'metadata': 199 return ['timitdic.txt', 'spkrinfo.txt'] 200 else: 201 raise ValueError('Bad value for filetype: %r' % filetype) 202 203 def utteranceids( 204 self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None 205 ): 206 """ 207 :return: A list of the utterance identifiers for all 208 utterances in this corpus, or for the given speaker, dialect 209 region, gender, sentence type, or sentence number, if 210 specified. 211 """ 212 if isinstance(dialect, string_types): 213 dialect = [dialect] 214 if isinstance(sex, string_types): 215 sex = [sex] 216 if isinstance(spkrid, string_types): 217 spkrid = [spkrid] 218 if isinstance(sent_type, string_types): 219 sent_type = [sent_type] 220 if isinstance(sentid, string_types): 221 sentid = [sentid] 222 223 utterances = self._utterances[:] 224 if dialect is not None: 225 utterances = [u for u in utterances if u[2] in dialect] 226 if sex is not None: 227 utterances = [u for u in utterances if u[4] in sex] 228 if spkrid is not None: 229 utterances = [u for u in utterances if u[:9] in spkrid] 230 if sent_type is not None: 231 utterances = [u for u in utterances if u[11] in sent_type] 232 if sentid is not None: 233 utterances = [u for u in utterances if u[10:] in spkrid] 234 return utterances 235 236 def transcription_dict(self): 237 """ 238 :return: A dictionary giving the 'standard' transcription for 239 each word. 240 """ 241 _transcriptions = {} 242 for line in self.open('timitdic.txt'): 243 if not line.strip() or line[0] == ';': 244 continue 245 m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line) 246 if not m: 247 raise ValueError('Bad line: %r' % line) 248 _transcriptions[m.group(1)] = m.group(2).split() 249 return _transcriptions 250 251 def spkrid(self, utterance): 252 return utterance.split('/')[0] 253 254 def sentid(self, utterance): 255 return utterance.split('/')[1] 256 257 def utterance(self, spkrid, sentid): 258 return '%s/%s' % (spkrid, sentid) 259 260 def spkrutteranceids(self, speaker): 261 """ 262 :return: A list of all utterances associated with a given 263 speaker. 264 """ 265 return [ 266 utterance 267 for utterance in self._utterances 268 if utterance.startswith(speaker + '/') 269 ] 270 271 def spkrinfo(self, speaker): 272 """ 273 :return: A dictionary mapping .. something. 274 """ 275 if speaker in self._utterances: 276 speaker = self.spkrid(speaker) 277 278 if self._speakerinfo is None: 279 self._speakerinfo = {} 280 for line in self.open('spkrinfo.txt'): 281 if not line.strip() or line[0] == ';': 282 continue 283 rec = line.strip().split(None, 9) 284 key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower()) 285 self._speakerinfo[key] = SpeakerInfo(*rec) 286 287 return self._speakerinfo[speaker] 288 289 def phones(self, utterances=None): 290 return [ 291 line.split()[-1] 292 for fileid in self._utterance_fileids(utterances, '.phn') 293 for line in self.open(fileid) 294 if line.strip() 295 ] 296 297 def phone_times(self, utterances=None): 298 """ 299 offset is represented as a number of 16kHz samples! 300 """ 301 return [ 302 (line.split()[2], int(line.split()[0]), int(line.split()[1])) 303 for fileid in self._utterance_fileids(utterances, '.phn') 304 for line in self.open(fileid) 305 if line.strip() 306 ] 307 308 def words(self, utterances=None): 309 return [ 310 line.split()[-1] 311 for fileid in self._utterance_fileids(utterances, '.wrd') 312 for line in self.open(fileid) 313 if line.strip() 314 ] 315 316 def word_times(self, utterances=None): 317 return [ 318 (line.split()[2], int(line.split()[0]), int(line.split()[1])) 319 for fileid in self._utterance_fileids(utterances, '.wrd') 320 for line in self.open(fileid) 321 if line.strip() 322 ] 323 324 def sents(self, utterances=None): 325 return [ 326 [line.split()[-1] for line in self.open(fileid) if line.strip()] 327 for fileid in self._utterance_fileids(utterances, '.wrd') 328 ] 329 330 def sent_times(self, utterances=None): 331 return [ 332 ( 333 line.split(None, 2)[-1].strip(), 334 int(line.split()[0]), 335 int(line.split()[1]), 336 ) 337 for fileid in self._utterance_fileids(utterances, '.txt') 338 for line in self.open(fileid) 339 if line.strip() 340 ] 341 342 def phone_trees(self, utterances=None): 343 if utterances is None: 344 utterances = self._utterances 345 if isinstance(utterances, string_types): 346 utterances = [utterances] 347 348 trees = [] 349 for utterance in utterances: 350 word_times = self.word_times(utterance) 351 phone_times = self.phone_times(utterance) 352 sent_times = self.sent_times(utterance) 353 354 while sent_times: 355 (sent, sent_start, sent_end) = sent_times.pop(0) 356 trees.append(Tree('S', [])) 357 while ( 358 word_times and phone_times and phone_times[0][2] <= word_times[0][1] 359 ): 360 trees[-1].append(phone_times.pop(0)[0]) 361 while word_times and word_times[0][2] <= sent_end: 362 (word, word_start, word_end) = word_times.pop(0) 363 trees[-1].append(Tree(word, [])) 364 while phone_times and phone_times[0][2] <= word_end: 365 trees[-1][-1].append(phone_times.pop(0)[0]) 366 while phone_times and phone_times[0][2] <= sent_end: 367 trees[-1].append(phone_times.pop(0)[0]) 368 return trees 369 370 # [xx] NOTE: This is currently broken -- we're assuming that the 371 # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE 372 # fileids. 373 def wav(self, utterance, start=0, end=None): 374 # nltk.chunk conflicts with the stdlib module 'chunk' 375 wave = import_from_stdlib('wave') 376 377 w = wave.open(self.open(utterance + '.wav'), 'rb') 378 379 if end is None: 380 end = w.getnframes() 381 382 # Skip past frames before start, then read the frames we want 383 w.readframes(start) 384 frames = w.readframes(end - start) 385 386 # Open a new temporary file -- the wave module requires 387 # an actual file, and won't work w/ stringio. :( 388 tf = tempfile.TemporaryFile() 389 out = wave.open(tf, 'w') 390 391 # Write the parameters & data to the new file. 392 out.setparams(w.getparams()) 393 out.writeframes(frames) 394 out.close() 395 396 # Read the data back from the file, and return it. The 397 # file will automatically be deleted when we return. 398 tf.seek(0) 399 return tf.read() 400 401 def audiodata(self, utterance, start=0, end=None): 402 assert end is None or end > start 403 headersize = 44 404 if end is None: 405 data = self.open(utterance + '.wav').read() 406 else: 407 data = self.open(utterance + '.wav').read(headersize + end * 2) 408 return data[headersize + start * 2 :] 409 410 def _utterance_fileids(self, utterances, extension): 411 if utterances is None: 412 utterances = self._utterances 413 if isinstance(utterances, string_types): 414 utterances = [utterances] 415 return ['%s%s' % (u, extension) for u in utterances] 416 417 def play(self, utterance, start=0, end=None): 418 """ 419 Play the given audio sample. 420 421 :param utterance: The utterance id of the sample to play 422 """ 423 # Method 1: os audio dev. 424 try: 425 import ossaudiodev 426 427 try: 428 dsp = ossaudiodev.open('w') 429 dsp.setfmt(ossaudiodev.AFMT_S16_LE) 430 dsp.channels(1) 431 dsp.speed(16000) 432 dsp.write(self.audiodata(utterance, start, end)) 433 dsp.close() 434 except IOError as e: 435 print( 436 ( 437 "can't acquire the audio device; please " 438 "activate your audio device." 439 ), 440 file=sys.stderr, 441 ) 442 print("system error message:", str(e), file=sys.stderr) 443 return 444 except ImportError: 445 pass 446 447 # Method 2: pygame 448 try: 449 # FIXME: this won't work under python 3 450 import pygame.mixer, StringIO 451 452 pygame.mixer.init(16000) 453 f = StringIO.StringIO(self.wav(utterance, start, end)) 454 pygame.mixer.Sound(f).play() 455 while pygame.mixer.get_busy(): 456 time.sleep(0.01) 457 return 458 except ImportError: 459 pass 460 461 # Method 3: complain. :) 462 print( 463 ("you must install pygame or ossaudiodev " "for audio playback."), 464 file=sys.stderr, 465 ) 466 467 468@compat.python_2_unicode_compatible 469class SpeakerInfo(object): 470 def __init__( 471 self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None 472 ): 473 self.id = id 474 self.sex = sex 475 self.dr = dr 476 self.use = use 477 self.recdate = recdate 478 self.birthdate = birthdate 479 self.ht = ht 480 self.race = race 481 self.edu = edu 482 self.comments = comments 483 484 def __repr__(self): 485 attribs = 'id sex dr use recdate birthdate ht race edu comments' 486 args = ['%s=%r' % (attr, getattr(self, attr)) for attr in attribs.split()] 487 return 'SpeakerInfo(%s)' % (', '.join(args)) 488 489 490def read_timit_block(stream): 491 """ 492 Block reader for timit tagged sentences, which are preceded by a sentence 493 number that will be ignored. 494 """ 495 line = stream.readline() 496 if not line: 497 return [] 498 n, sent = line.split(' ', 1) 499 return [sent] 500