1# Natural Language Toolkit: TIMIT Corpus Reader
2#
3# Copyright (C) 2001-2007 NLTK Project
4# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
5#         Steven Bird <stevenbird1@gmail.com>
6#         Jacob Perkins <japerk@gmail.com>
7# URL: <http://nltk.org/>
8# For license information, see LICENSE.TXT
9
10# [xx] this docstring is out-of-date:
11"""
12Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
13
This corpus contains a selected portion of the TIMIT corpus.
15
16 - 16 speakers from 8 dialect regions
17 - 1 male and 1 female from each dialect region
18 - total 130 sentences (10 sentences per speaker.  Note that some
19   sentences are shared among other speakers, especially sa1 and sa2
20   are spoken by all speakers.)
 - total 160 recordings of sentences (10 recordings per speaker)
22 - audio format: NIST Sphere, single channel, 16kHz sampling,
23  16 bit sample, PCM encoding
24
25
26Module contents
27===============
28
29The timit corpus reader provides 4 functions and 4 data items.
30
31 - utterances
32
33   List of utterances in the corpus.  There are total 160 utterances,
34   each of which corresponds to a unique utterance of a speaker.
35   Here's an example of an utterance identifier in the list::
36
37       dr1-fvmh0/sx206
38         - _----  _---
39         | |  |   | |
40         | |  |   | |
41         | |  |   | `--- sentence number
42         | |  |   `----- sentence type (a:all, i:shared, x:exclusive)
43         | |  `--------- speaker ID
44         | `------------ sex (m:male, f:female)
45         `-------------- dialect region (1..8)
46
47 - speakers
48
49   List of speaker IDs.  An example of speaker ID::
50
51       dr1-fvmh0
52
   Note that if you split an item ID on the slash and take the first element of
   the result, you will get a speaker ID.
55
56       >>> itemid = 'dr1-fvmh0/sx206'
57       >>> spkrid , sentid = itemid.split('/')
58       >>> spkrid
59       'dr1-fvmh0'
60
61   The second element of the result is a sentence ID.
62
63 - dictionary()
64
65   Phonetic dictionary of words contained in this corpus.  This is a Python
66   dictionary from words to phoneme lists.
67
68 - spkrinfo()
69
70   Speaker information table.  It's a Python dictionary from speaker IDs to
   records of 10 fields.  Speaker IDs are the same as the ones in timit.speakers.
72   Each record is a dictionary from field names to values, and the fields are
73   as follows::
74
75     id         speaker ID as defined in the original TIMIT speaker info table
76     sex        speaker gender (M:male, F:female)
77     dr         speaker dialect region (1:new england, 2:northern,
78                3:north midland, 4:south midland, 5:southern, 6:new york city,
79                7:western, 8:army brat (moved around))
80     use        corpus type (TRN:training, TST:test)
81                in this sample corpus only TRN is available
82     recdate    recording date
83     birthdate  speaker birth date
84     ht         speaker height
85     race       speaker race (WHT:white, BLK:black, AMR:american indian,
86                SPN:spanish-american, ORN:oriental,???:unknown)
87     edu        speaker education level (HS:high school, AS:associate degree,
88                BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
89                PHD:doctorate degree (PhD,JD,MD), ??:unknown)
90     comments   comments by the recorder
91
92The 4 functions are as follows.
93
94 - tokenized(sentences=items, offset=False)
95
96   Given a list of items, returns an iterator of a list of word lists,
97   each of which corresponds to an item (sentence).  If offset is set to True,
98   each element of the word list is a tuple of word(string), start offset and
99   end offset, where offset is represented as a number of 16kHz samples.
100
101 - phonetic(sentences=items, offset=False)
102
103   Given a list of items, returns an iterator of a list of phoneme lists,
104   each of which corresponds to an item (sentence).  If offset is set to True,
   each element of the phoneme list is a tuple of phoneme(string), start offset
106   and end offset, where offset is represented as a number of 16kHz samples.
107
108 - audiodata(item, start=0, end=None)
109
110   Given an item, returns a chunk of audio samples formatted into a string.
   When the function is called, if start and end are omitted, the entire
112   samples of the recording will be returned.  If only end is omitted,
113   samples from the start offset to the end of the recording will be returned.
114
115 - play(data)
116
117   Play the given audio samples. The audio samples can be obtained from the
118   timit.audiodata function.
119
120"""
121from __future__ import print_function, unicode_literals
122
123import sys
124import os
125import re
126import tempfile
127import time
128
129from six import string_types
130
131from nltk import compat
132from nltk.tree import Tree
133from nltk.internals import import_from_stdlib
134
135from nltk.corpus.reader.util import *
136from nltk.corpus.reader.api import *
137
138
139class TimitCorpusReader(CorpusReader):
140    """
141    Reader for the TIMIT corpus (or any other corpus with the same
142    file layout and use of file formats).  The corpus root directory
143    should contain the following files:
144
145      - timitdic.txt: dictionary of standard transcriptions
146      - spkrinfo.txt: table of speaker information
147
    In addition, the root directory should contain one subdirectory
    for each speaker, containing four files for each utterance:
150
151      - <utterance-id>.txt: text content of utterances
152      - <utterance-id>.wrd: tokenized text content of utterances
153      - <utterance-id>.phn: phonetic transcription of utterances
154      - <utterance-id>.wav: utterance sound file
155    """
156
157    _FILE_RE = r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt'
158    """A regexp matching fileids that are used by this corpus reader."""
159    _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'
160
161    def __init__(self, root, encoding='utf8'):
162        """
163        Construct a new TIMIT corpus reader in the given directory.
164        :param root: The root directory for this corpus.
165        """
166        # Ensure that wave files don't get treated as unicode data:
167        if isinstance(encoding, string_types):
168            encoding = [('.*\.wav', None), ('.*', encoding)]
169
170        CorpusReader.__init__(
171            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
172        )
173
174        self._utterances = [
175            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
176        ]
177        """A list of the utterance identifiers for all utterances in
178        this corpus."""
179
180        self._speakerinfo = None
181        self._root = root
182        self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))
183
184    def fileids(self, filetype=None):
185        """
186        Return a list of file identifiers for the files that make up
187        this corpus.
188
189        :param filetype: If specified, then ``filetype`` indicates that
190            only the files that have the given type should be
191            returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
192            ``wav``, or ``metadata``,
193        """
194        if filetype is None:
195            return CorpusReader.fileids(self)
196        elif filetype in ('txt', 'wrd', 'phn', 'wav'):
197            return ['%s.%s' % (u, filetype) for u in self._utterances]
198        elif filetype == 'metadata':
199            return ['timitdic.txt', 'spkrinfo.txt']
200        else:
201            raise ValueError('Bad value for filetype: %r' % filetype)
202
203    def utteranceids(
204        self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
205    ):
206        """
207        :return: A list of the utterance identifiers for all
208        utterances in this corpus, or for the given speaker, dialect
209        region, gender, sentence type, or sentence number, if
210        specified.
211        """
212        if isinstance(dialect, string_types):
213            dialect = [dialect]
214        if isinstance(sex, string_types):
215            sex = [sex]
216        if isinstance(spkrid, string_types):
217            spkrid = [spkrid]
218        if isinstance(sent_type, string_types):
219            sent_type = [sent_type]
220        if isinstance(sentid, string_types):
221            sentid = [sentid]
222
223        utterances = self._utterances[:]
224        if dialect is not None:
225            utterances = [u for u in utterances if u[2] in dialect]
226        if sex is not None:
227            utterances = [u for u in utterances if u[4] in sex]
228        if spkrid is not None:
229            utterances = [u for u in utterances if u[:9] in spkrid]
230        if sent_type is not None:
231            utterances = [u for u in utterances if u[11] in sent_type]
232        if sentid is not None:
233            utterances = [u for u in utterances if u[10:] in spkrid]
234        return utterances
235
236    def transcription_dict(self):
237        """
238        :return: A dictionary giving the 'standard' transcription for
239        each word.
240        """
241        _transcriptions = {}
242        for line in self.open('timitdic.txt'):
243            if not line.strip() or line[0] == ';':
244                continue
245            m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
246            if not m:
247                raise ValueError('Bad line: %r' % line)
248            _transcriptions[m.group(1)] = m.group(2).split()
249        return _transcriptions
250
251    def spkrid(self, utterance):
252        return utterance.split('/')[0]
253
254    def sentid(self, utterance):
255        return utterance.split('/')[1]
256
257    def utterance(self, spkrid, sentid):
258        return '%s/%s' % (spkrid, sentid)
259
260    def spkrutteranceids(self, speaker):
261        """
262        :return: A list of all utterances associated with a given
263        speaker.
264        """
265        return [
266            utterance
267            for utterance in self._utterances
268            if utterance.startswith(speaker + '/')
269        ]
270
271    def spkrinfo(self, speaker):
272        """
273        :return: A dictionary mapping .. something.
274        """
275        if speaker in self._utterances:
276            speaker = self.spkrid(speaker)
277
278        if self._speakerinfo is None:
279            self._speakerinfo = {}
280            for line in self.open('spkrinfo.txt'):
281                if not line.strip() or line[0] == ';':
282                    continue
283                rec = line.strip().split(None, 9)
284                key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
285                self._speakerinfo[key] = SpeakerInfo(*rec)
286
287        return self._speakerinfo[speaker]
288
289    def phones(self, utterances=None):
290        return [
291            line.split()[-1]
292            for fileid in self._utterance_fileids(utterances, '.phn')
293            for line in self.open(fileid)
294            if line.strip()
295        ]
296
297    def phone_times(self, utterances=None):
298        """
299        offset is represented as a number of 16kHz samples!
300        """
301        return [
302            (line.split()[2], int(line.split()[0]), int(line.split()[1]))
303            for fileid in self._utterance_fileids(utterances, '.phn')
304            for line in self.open(fileid)
305            if line.strip()
306        ]
307
308    def words(self, utterances=None):
309        return [
310            line.split()[-1]
311            for fileid in self._utterance_fileids(utterances, '.wrd')
312            for line in self.open(fileid)
313            if line.strip()
314        ]
315
316    def word_times(self, utterances=None):
317        return [
318            (line.split()[2], int(line.split()[0]), int(line.split()[1]))
319            for fileid in self._utterance_fileids(utterances, '.wrd')
320            for line in self.open(fileid)
321            if line.strip()
322        ]
323
324    def sents(self, utterances=None):
325        return [
326            [line.split()[-1] for line in self.open(fileid) if line.strip()]
327            for fileid in self._utterance_fileids(utterances, '.wrd')
328        ]
329
330    def sent_times(self, utterances=None):
331        return [
332            (
333                line.split(None, 2)[-1].strip(),
334                int(line.split()[0]),
335                int(line.split()[1]),
336            )
337            for fileid in self._utterance_fileids(utterances, '.txt')
338            for line in self.open(fileid)
339            if line.strip()
340        ]
341
    def phone_trees(self, utterances=None):
        """
        Build one tree per sentence, grouping phones under the words they
        fall into.

        Each tree is labelled ``S``.  Its children are word subtrees (a
        ``Tree`` labelled with the word, whose children are phone labels)
        and bare phone labels for the phones that were not attached to any
        word.

        :param utterances: An utterance identifier, a list of identifiers,
            or ``None`` for every utterance in the corpus.
        :return: A list of ``Tree`` objects, one per entry returned by
            ``sent_times`` for the requested utterances.
        """
        # Default to every utterance; accept a single identifier as well.
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, string_types):
            utterances = [utterances]

        trees = []
        for utterance in utterances:
            # Each list holds (label, start, end) tuples and is consumed
            # from the front as its items are placed into a tree.
            word_times = self.word_times(utterance)
            phone_times = self.phone_times(utterance)
            sent_times = self.sent_times(utterance)

            while sent_times:
                (sent, sent_start, sent_end) = sent_times.pop(0)
                trees.append(Tree('S', []))
                # Phones that end no later than the start of the next word
                # are attached directly to the sentence node.
                while (
                    word_times and phone_times and phone_times[0][2] <= word_times[0][1]
                ):
                    trees[-1].append(phone_times.pop(0)[0])
                # Every word that ends within the sentence becomes a
                # subtree holding the phones that end no later than it does.
                while word_times and word_times[0][2] <= sent_end:
                    (word, word_start, word_end) = word_times.pop(0)
                    trees[-1].append(Tree(word, []))
                    while phone_times and phone_times[0][2] <= word_end:
                        trees[-1][-1].append(phone_times.pop(0)[0])
                # Phones left over that still end within the sentence are
                # attached directly to the sentence node as well.
                while phone_times and phone_times[0][2] <= sent_end:
                    trees[-1].append(phone_times.pop(0)[0])
        return trees
369
370    # [xx] NOTE: This is currently broken -- we're assuming that the
371    # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
372    # fileids.
373    def wav(self, utterance, start=0, end=None):
374        # nltk.chunk conflicts with the stdlib module 'chunk'
375        wave = import_from_stdlib('wave')
376
377        w = wave.open(self.open(utterance + '.wav'), 'rb')
378
379        if end is None:
380            end = w.getnframes()
381
382        # Skip past frames before start, then read the frames we want
383        w.readframes(start)
384        frames = w.readframes(end - start)
385
386        # Open a new temporary file -- the wave module requires
387        # an actual file, and won't work w/ stringio. :(
388        tf = tempfile.TemporaryFile()
389        out = wave.open(tf, 'w')
390
391        # Write the parameters & data to the new file.
392        out.setparams(w.getparams())
393        out.writeframes(frames)
394        out.close()
395
396        # Read the data back from the file, and return it.  The
397        # file will automatically be deleted when we return.
398        tf.seek(0)
399        return tf.read()
400
401    def audiodata(self, utterance, start=0, end=None):
402        assert end is None or end > start
403        headersize = 44
404        if end is None:
405            data = self.open(utterance + '.wav').read()
406        else:
407            data = self.open(utterance + '.wav').read(headersize + end * 2)
408        return data[headersize + start * 2 :]
409
410    def _utterance_fileids(self, utterances, extension):
411        if utterances is None:
412            utterances = self._utterances
413        if isinstance(utterances, string_types):
414            utterances = [utterances]
415        return ['%s%s' % (u, extension) for u in utterances]
416
417    def play(self, utterance, start=0, end=None):
418        """
419        Play the given audio sample.
420
421        :param utterance: The utterance id of the sample to play
422        """
423        # Method 1: os audio dev.
424        try:
425            import ossaudiodev
426
427            try:
428                dsp = ossaudiodev.open('w')
429                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
430                dsp.channels(1)
431                dsp.speed(16000)
432                dsp.write(self.audiodata(utterance, start, end))
433                dsp.close()
434            except IOError as e:
435                print(
436                    (
437                        "can't acquire the audio device; please "
438                        "activate your audio device."
439                    ),
440                    file=sys.stderr,
441                )
442                print("system error message:", str(e), file=sys.stderr)
443            return
444        except ImportError:
445            pass
446
447        # Method 2: pygame
448        try:
449            # FIXME: this won't work under python 3
450            import pygame.mixer, StringIO
451
452            pygame.mixer.init(16000)
453            f = StringIO.StringIO(self.wav(utterance, start, end))
454            pygame.mixer.Sound(f).play()
455            while pygame.mixer.get_busy():
456                time.sleep(0.01)
457            return
458        except ImportError:
459            pass
460
461        # Method 3: complain. :)
462        print(
463            ("you must install pygame or ossaudiodev " "for audio playback."),
464            file=sys.stderr,
465        )
466
467
@compat.python_2_unicode_compatible
class SpeakerInfo(object):
    """
    Plain record holding the fields of one speaker table entry.

    Every constructor argument is stored unchanged under the attribute of
    the same name; ``comments`` is optional and defaults to ``None``.
    """

    # Attribute names, in constructor (and repr) order.
    _FIELDS = (
        'id',
        'sex',
        'dr',
        'use',
        'recdate',
        'birthdate',
        'ht',
        'race',
        'edu',
        'comments',
    )

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        values = (id, sex, dr, use, recdate, birthdate, ht, race, edu, comments)
        for name, value in zip(self._FIELDS, values):
            setattr(self, name, value)

    def __repr__(self):
        shown = ', '.join(
            '%s=%r' % (name, getattr(self, name)) for name in self._FIELDS
        )
        return 'SpeakerInfo(%s)' % shown
488
489
def read_timit_block(stream):
    """
    Read one TIMIT tagged sentence from ``stream``.

    A line is expected to hold a sentence number, a single space and the
    sentence itself; the number is discarded.

    :param stream: An object with a ``readline()`` method.
    :return: A one-element list holding the rest of the line after the
        first space (line ending included), or an empty list when
        ``readline()`` returns nothing.
    :raise ValueError: If the line contains no space.
    """
    line = stream.readline()
    if line:
        _number, sentence = line.split(' ', 1)
        return [sentence]
    return []
500