1#!/usr/bin/env python3
2
3"""Library for performing speech recognition, with support for several engines and APIs, online and offline."""
4
5import io
6import os
7import sys
8import subprocess
9import wave
10import aifc
11import math
12import audioop
13import collections
14import json
15import base64
16import threading
17import platform
18import stat
19import hashlib
20import hmac
21import time
22import uuid
23
24__author__ = "Anthony Zhang (Uberi)"
25__version__ = "3.8.1"
26__license__ = "BSD"
27
28try:  # attempt to use the Python 2 modules
29    from urllib import urlencode
30    from urllib2 import Request, urlopen, URLError, HTTPError
31except ImportError:  # use the Python 3 modules
32    from urllib.parse import urlencode
33    from urllib.request import Request, urlopen
34    from urllib.error import URLError, HTTPError
35
36
37class WaitTimeoutError(Exception): pass
38
39
40class RequestError(Exception): pass
41
42
43class UnknownValueError(Exception): pass
44
45
46class AudioSource(object):
47    def __init__(self):
48        raise NotImplementedError("this is an abstract class")
49
50    def __enter__(self):
51        raise NotImplementedError("this is an abstract class")
52
53    def __exit__(self, exc_type, exc_value, traceback):
54        raise NotImplementedError("this is an abstract class")
55
56
57class Microphone(AudioSource):
58    """
59    Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``.
60
61    This will throw an ``AttributeError`` if you don't have PyAudio 0.2.11 or later installed.
62
63    If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input.
64
65    A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation <http://people.csail.mit.edu/hubert/pyaudio/docs/>`__ for more details.
66
67    The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). If not specified, the value of ``sample_rate`` is determined automatically from the system's microphone settings.
68
69    Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some CPUs, such as those in older Raspberry Pi models, can't keep up if this value is too high.
70
71    Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default.
72    """
73    def __init__(self, device_index=None, sample_rate=None, chunk_size=1024):
74        assert device_index is None or isinstance(device_index, int), "Device index must be None or an integer"
75        assert sample_rate is None or (isinstance(sample_rate, int) and sample_rate > 0), "Sample rate must be None or a positive integer"
76        assert isinstance(chunk_size, int) and chunk_size > 0, "Chunk size must be a positive integer"
77
78        # set up PyAudio
79        self.pyaudio_module = self.get_pyaudio()
80        audio = self.pyaudio_module.PyAudio()
81        try:
82            count = audio.get_device_count()  # obtain device count
83            if device_index is not None:  # ensure device index is in range
84                assert 0 <= device_index < count, "Device index out of range ({} devices available; device index should be between 0 and {} inclusive)".format(count, count - 1)
85            if sample_rate is None:  # automatically set the sample rate to the hardware's default sample rate if not specified
86                device_info = audio.get_device_info_by_index(device_index) if device_index is not None else audio.get_default_input_device_info()
87                assert isinstance(device_info.get("defaultSampleRate"), (float, int)) and device_info["defaultSampleRate"] > 0, "Invalid device info returned from PyAudio: {}".format(device_info)
88                sample_rate = int(device_info["defaultSampleRate"])
89        except Exception:
90            audio.terminate()
91            raise
92
93        self.device_index = device_index
94        self.format = self.pyaudio_module.paInt16  # 16-bit int sampling
95        self.SAMPLE_WIDTH = self.pyaudio_module.get_sample_size(self.format)  # size of each sample
96        self.SAMPLE_RATE = sample_rate  # sampling rate in Hertz
97        self.CHUNK = chunk_size  # number of frames stored in each buffer
98
99        self.audio = None
100        self.stream = None
101
102    @staticmethod
103    def get_pyaudio():
104        """
105        Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed
106        """
107        try:
108            import pyaudio
109        except ImportError:
110            raise AttributeError("Could not find PyAudio; check installation")
111        from distutils.version import LooseVersion
112        if LooseVersion(pyaudio.__version__) < LooseVersion("0.2.11"):
113            raise AttributeError("PyAudio 0.2.11 or later is required (found version {})".format(pyaudio.__version__))
114        return pyaudio
115
116    @staticmethod
117    def list_microphone_names():
118        """
119        Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead.
120
121        The index of each microphone's name is the same as its device index when creating a ``Microphone`` instance - indices in this list can be used as values of ``device_index``.
122        """
123        audio = Microphone.get_pyaudio().PyAudio()
124        try:
125            result = []
126            for i in range(audio.get_device_count()):
127                device_info = audio.get_device_info_by_index(i)
128                result.append(device_info.get("name"))
129        finally:
130            audio.terminate()
131        return result
132
133    def __enter__(self):
134        assert self.stream is None, "This audio source is already inside a context manager"
135        self.audio = self.pyaudio_module.PyAudio()
136        try:
137            self.stream = Microphone.MicrophoneStream(
138                self.audio.open(
139                    input_device_index=self.device_index, channels=1,
140                    format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK,
141                    input=True,  # stream is an input stream
142                )
143            )
144        except Exception:
145            self.audio.terminate()
146            raise
147        return self
148
149    def __exit__(self, exc_type, exc_value, traceback):
150        try:
151            self.stream.close()
152        finally:
153            self.stream = None
154            self.audio.terminate()
155
156    class MicrophoneStream(object):
157        def __init__(self, pyaudio_stream):
158            self.pyaudio_stream = pyaudio_stream
159
160        def read(self, size):
161            return self.pyaudio_stream.read(size, exception_on_overflow=False)
162
163        def close(self):
164            try:
165                # sometimes, if the stream isn't stopped, closing the stream throws an exception
166                if not self.pyaudio_stream.is_stopped():
167                    self.pyaudio_stream.stop_stream()
168            finally:
169                self.pyaudio_stream.close()
170
171
172class AudioFile(AudioSource):
173    """
174    Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. Subclass of ``AudioSource``.
175
176    If ``filename_or_fileobject`` is a string, then it is interpreted as a path to an audio file on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar.
177
178    Note that functions that read from the audio (such as ``recognizer_instance.record`` or ``recognizer_instance.listen``) will move ahead in the stream. For example, if you execute ``recognizer_instance.record(audiofile_instance, duration=10)`` twice, the first time it will return the first 10 seconds of audio, and the second time it will return the 10 seconds of audio right after that. This is always reset to the beginning when entering an ``AudioFile`` context.
179
180    WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour.
181
182    Both AIFF and AIFF-C (compressed AIFF) formats are supported.
183
184    FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour.
185    """
186
187    def __init__(self, filename_or_fileobject):
188        assert isinstance(filename_or_fileobject, (type(""), type(u""))) or hasattr(filename_or_fileobject, "read"), "Given audio file must be a filename string or a file-like object"
189        self.filename_or_fileobject = filename_or_fileobject
190        self.stream = None
191        self.DURATION = None
192
193        self.audio_reader = None
194        self.little_endian = False
195        self.SAMPLE_RATE = None
196        self.CHUNK = None
197        self.FRAME_COUNT = None
198
199    def __enter__(self):
200        assert self.stream is None, "This audio source is already inside a context manager"
201        try:
202            # attempt to read the file as WAV
203            self.audio_reader = wave.open(self.filename_or_fileobject, "rb")
204            self.little_endian = True  # RIFF WAV is a little-endian format (most ``audioop`` operations assume that the frames are stored in little-endian form)
205        except (wave.Error, EOFError):
206            try:
207                # attempt to read the file as AIFF
208                self.audio_reader = aifc.open(self.filename_or_fileobject, "rb")
209                self.little_endian = False  # AIFF is a big-endian format
210            except (aifc.Error, EOFError):
211                # attempt to read the file as FLAC
212                if hasattr(self.filename_or_fileobject, "read"):
213                    flac_data = self.filename_or_fileobject.read()
214                else:
215                    with open(self.filename_or_fileobject, "rb") as f: flac_data = f.read()
216
217                # run the FLAC converter with the FLAC data to get the AIFF data
218                flac_converter = get_flac_converter()
219                if os.name == "nt":  # on Windows, specify that the process is to be started without showing a console window
220                    startup_info = subprocess.STARTUPINFO()
221                    startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW  # specify that the wShowWindow field of `startup_info` contains a value
222                    startup_info.wShowWindow = subprocess.SW_HIDE  # specify that the console window should be hidden
223                else:
224                    startup_info = None  # default startupinfo
225                process = subprocess.Popen([
226                    flac_converter,
227                    "--stdout", "--totally-silent",  # put the resulting AIFF file in stdout, and make sure it's not mixed with any program output
228                    "--decode", "--force-aiff-format",  # decode the FLAC file into an AIFF file
229                    "-",  # the input FLAC file contents will be given in stdin
230                ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info)
231                aiff_data, _ = process.communicate(flac_data)
232                aiff_file = io.BytesIO(aiff_data)
233                try:
234                    self.audio_reader = aifc.open(aiff_file, "rb")
235                except (aifc.Error, EOFError):
236                    raise ValueError("Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format")
237                self.little_endian = False  # AIFF is a big-endian format
238        assert 1 <= self.audio_reader.getnchannels() <= 2, "Audio must be mono or stereo"
239        self.SAMPLE_WIDTH = self.audio_reader.getsampwidth()
240
241        # 24-bit audio needs some special handling for old Python versions (workaround for https://bugs.python.org/issue12866)
242        samples_24_bit_pretending_to_be_32_bit = False
243        if self.SAMPLE_WIDTH == 3:  # 24-bit audio
244            try: audioop.bias(b"", self.SAMPLE_WIDTH, 0)  # test whether this sample width is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do)
245            except audioop.error:  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
246                samples_24_bit_pretending_to_be_32_bit = True  # while the ``AudioFile`` instance will outwardly appear to be 32-bit, it will actually internally be 24-bit
247                self.SAMPLE_WIDTH = 4  # the ``AudioFile`` instance should present itself as a 32-bit stream now, since we'll be converting into 32-bit on the fly when reading
248
249        self.SAMPLE_RATE = self.audio_reader.getframerate()
250        self.CHUNK = 4096
251        self.FRAME_COUNT = self.audio_reader.getnframes()
252        self.DURATION = self.FRAME_COUNT / float(self.SAMPLE_RATE)
253        self.stream = AudioFile.AudioFileStream(self.audio_reader, self.little_endian, samples_24_bit_pretending_to_be_32_bit)
254        return self
255
256    def __exit__(self, exc_type, exc_value, traceback):
257        if not hasattr(self.filename_or_fileobject, "read"):  # only close the file if it was opened by this class in the first place (if the file was originally given as a path)
258            self.audio_reader.close()
259        self.stream = None
260        self.DURATION = None
261
262    class AudioFileStream(object):
263        def __init__(self, audio_reader, little_endian, samples_24_bit_pretending_to_be_32_bit):
264            self.audio_reader = audio_reader  # an audio file object (e.g., a `wave.Wave_read` instance)
265            self.little_endian = little_endian  # whether the audio data is little-endian (when working with big-endian things, we'll have to convert it to little-endian before we process it)
266            self.samples_24_bit_pretending_to_be_32_bit = samples_24_bit_pretending_to_be_32_bit  # this is true if the audio is 24-bit audio, but 24-bit audio isn't supported, so we have to pretend that this is 32-bit audio and convert it on the fly
267
268        def read(self, size=-1):
269            buffer = self.audio_reader.readframes(self.audio_reader.getnframes() if size == -1 else size)
270            if not isinstance(buffer, bytes): buffer = b""  # workaround for https://bugs.python.org/issue24608
271
272            sample_width = self.audio_reader.getsampwidth()
273            if not self.little_endian:  # big endian format, convert to little endian on the fly
274                if hasattr(audioop, "byteswap"):  # ``audioop.byteswap`` was only added in Python 3.4 (incidentally, that also means that we don't need to worry about 24-bit audio being unsupported, since Python 3.4+ always has that functionality)
275                    buffer = audioop.byteswap(buffer, sample_width)
276                else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
277                    buffer = buffer[sample_width - 1::-1] + b"".join(buffer[i + sample_width:i:-1] for i in range(sample_width - 1, len(buffer), sample_width))
278
279            # workaround for https://bugs.python.org/issue12866
280            if self.samples_24_bit_pretending_to_be_32_bit:  # we need to convert samples from 24-bit to 32-bit before we can process them with ``audioop`` functions
281                buffer = b"".join(b"\x00" + buffer[i:i + sample_width] for i in range(0, len(buffer), sample_width))  # since we're in little endian, we prepend a zero byte to each 24-bit sample to get a 32-bit sample
282                sample_width = 4  # make sure we thread the buffer as 32-bit audio now, after converting it from 24-bit audio
283            if self.audio_reader.getnchannels() != 1:  # stereo audio
284                buffer = audioop.tomono(buffer, sample_width, 1, 1)  # convert stereo audio data to mono
285            return buffer
286
287
288class AudioData(object):
289    """
290    Creates a new ``AudioData`` instance, which represents mono audio data.
291
292    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.
293
294    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.
295
296    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).
297
298    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
299    """
300    def __init__(self, frame_data, sample_rate, sample_width):
301        assert sample_rate > 0, "Sample rate must be a positive integer"
302        assert sample_width % 1 == 0 and 1 <= sample_width <= 4, "Sample width must be between 1 and 4 inclusive"
303        self.frame_data = frame_data
304        self.sample_rate = sample_rate
305        self.sample_width = int(sample_width)
306
307    def get_segment(self, start_ms=None, end_ms=None):
308        """
309        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.
310
311        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
312        """
313        assert start_ms is None or start_ms >= 0, "``start_ms`` must be a non-negative number"
314        assert end_ms is None or end_ms >= (0 if start_ms is None else start_ms), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
315        if start_ms is None:
316            start_byte = 0
317        else:
318            start_byte = int((start_ms * self.sample_rate * self.sample_width) // 1000)
319        if end_ms is None:
320            end_byte = len(self.frame_data)
321        else:
322            end_byte = int((end_ms * self.sample_rate * self.sample_width) // 1000)
323        return AudioData(self.frame_data[start_byte:end_byte], self.sample_rate, self.sample_width)
324
325    def get_raw_data(self, convert_rate=None, convert_width=None):
326        """
327        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.
328
329        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
330
331        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
332
333        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
334        """
335        assert convert_rate is None or convert_rate > 0, "Sample rate to convert to must be a positive integer"
336        assert convert_width is None or (convert_width % 1 == 0 and 1 <= convert_width <= 4), "Sample width to convert to must be between 1 and 4 inclusive"
337
338        raw_data = self.frame_data
339
340        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
341        if self.sample_width == 1:
342            raw_data = audioop.bias(raw_data, 1, -128)  # subtract 128 from every sample to make them act like signed samples
343
344        # resample audio at the desired rate if specified
345        if convert_rate is not None and self.sample_rate != convert_rate:
346            raw_data, _ = audioop.ratecv(raw_data, self.sample_width, 1, self.sample_rate, convert_rate, None)
347
348        # convert samples to desired sample width if specified
349        if convert_width is not None and self.sample_width != convert_width:
350            if convert_width == 3:  # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
351                raw_data = audioop.lin2lin(raw_data, self.sample_width, 4)  # convert audio into 32-bit first, which is always supported
352                try: audioop.bias(b"", 3, 0)  # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do)
353                except audioop.error:  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
354                    raw_data = b"".join(raw_data[i + 1:i + 4] for i in range(0, len(raw_data), 4))  # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
355                else:  # 24-bit audio fully supported, we don't need to shim anything
356                    raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width)
357            else:
358                raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width)
359
360        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
361        if convert_width == 1:
362            raw_data = audioop.bias(raw_data, 1, 128)  # add 128 to every sample to make them act like unsigned samples again
363
364        return raw_data
365
366    def get_wav_data(self, convert_rate=None, convert_width=None):
367        """
368        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.
369
370        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
371
372        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
373
374        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
375        """
376        raw_data = self.get_raw_data(convert_rate, convert_width)
377        sample_rate = self.sample_rate if convert_rate is None else convert_rate
378        sample_width = self.sample_width if convert_width is None else convert_width
379
380        # generate the WAV file contents
381        with io.BytesIO() as wav_file:
382            wav_writer = wave.open(wav_file, "wb")
383            try:  # note that we can't use context manager, since that was only added in Python 3.4
384                wav_writer.setframerate(sample_rate)
385                wav_writer.setsampwidth(sample_width)
386                wav_writer.setnchannels(1)
387                wav_writer.writeframes(raw_data)
388                wav_data = wav_file.getvalue()
389            finally:  # make sure resources are cleaned up
390                wav_writer.close()
391        return wav_data
392
393    def get_aiff_data(self, convert_rate=None, convert_width=None):
394        """
395        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.
396
397        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
398
399        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
400
401        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
402        """
403        raw_data = self.get_raw_data(convert_rate, convert_width)
404        sample_rate = self.sample_rate if convert_rate is None else convert_rate
405        sample_width = self.sample_width if convert_width is None else convert_width
406
407        # the AIFF format is big-endian, so we need to covnert the little-endian raw data to big-endian
408        if hasattr(audioop, "byteswap"):  # ``audioop.byteswap`` was only added in Python 3.4
409            raw_data = audioop.byteswap(raw_data, sample_width)
410        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
411            raw_data = raw_data[sample_width - 1::-1] + b"".join(raw_data[i + sample_width:i:-1] for i in range(sample_width - 1, len(raw_data), sample_width))
412
413        # generate the AIFF-C file contents
414        with io.BytesIO() as aiff_file:
415            aiff_writer = aifc.open(aiff_file, "wb")
416            try:  # note that we can't use context manager, since that was only added in Python 3.4
417                aiff_writer.setframerate(sample_rate)
418                aiff_writer.setsampwidth(sample_width)
419                aiff_writer.setnchannels(1)
420                aiff_writer.writeframes(raw_data)
421                aiff_data = aiff_file.getvalue()
422            finally:  # make sure resources are cleaned up
423                aiff_writer.close()
424        return aiff_data
425
426    def get_flac_data(self, convert_rate=None, convert_width=None):
427        """
428        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.
429
430        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.
431
432        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
433
434        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
435
436        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
437        """
438        assert convert_width is None or (convert_width % 1 == 0 and 1 <= convert_width <= 3), "Sample width to convert to must be between 1 and 3 inclusive"
439
440        if self.sample_width > 3 and convert_width is None:  # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder
441            convert_width = 3  # the largest supported sample width is 24-bit, so we'll limit the sample width to that
442
443        # run the FLAC converter with the WAV data to get the FLAC data
444        wav_data = self.get_wav_data(convert_rate, convert_width)
445        flac_converter = get_flac_converter()
446        if os.name == "nt":  # on Windows, specify that the process is to be started without showing a console window
447            startup_info = subprocess.STARTUPINFO()
448            startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW  # specify that the wShowWindow field of `startup_info` contains a value
449            startup_info.wShowWindow = subprocess.SW_HIDE  # specify that the console window should be hidden
450        else:
451            startup_info = None  # default startupinfo
452        process = subprocess.Popen([
453            flac_converter,
454            "--stdout", "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
455            "--best",  # highest level of compression available
456            "-",  # the input FLAC file contents will be given in stdin
457        ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info)
458        flac_data, stderr = process.communicate(wav_data)
459        return flac_data
460
461
462class Recognizer(AudioSource):
463    def __init__(self):
464        """
465        Creates a new ``Recognizer`` instance, which represents a collection of speech recognition functionality.
466        """
467        self.energy_threshold = 300  # minimum audio energy to consider for recording
468        self.dynamic_energy_threshold = True
469        self.dynamic_energy_adjustment_damping = 0.15
470        self.dynamic_energy_ratio = 1.5
471        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
472        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
473
474        self.phrase_threshold = 0.3  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
475        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording
476
477    def record(self, source, duration=None, offset=None):
478        """
479        Records up to ``duration`` seconds of audio from ``source`` (an ``AudioSource`` instance) starting at ``offset`` (or at the beginning if not specified) into an ``AudioData`` instance, which it returns.
480
481        If ``duration`` is not specified, then it will record until there is no more audio input.
482        """
483        assert isinstance(source, AudioSource), "Source must be an audio source"
484        assert source.stream is not None, "Audio source must be entered before recording, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
485
486        frames = io.BytesIO()
487        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
488        elapsed_time = 0
489        offset_time = 0
490        offset_reached = False
491        while True:  # loop for the total number of chunks needed
492            if offset and not offset_reached:
493                offset_time += seconds_per_buffer
494                if offset_time > offset:
495                    offset_reached = True
496
497            buffer = source.stream.read(source.CHUNK)
498            if len(buffer) == 0: break
499
500            if offset_reached or not offset:
501                elapsed_time += seconds_per_buffer
502                if duration and elapsed_time > duration: break
503
504                frames.write(buffer)
505
506        frame_data = frames.getvalue()
507        frames.close()
508        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
509
510    def adjust_for_ambient_noise(self, source, duration=1):
511        """
512        Adjusts the energy threshold dynamically using audio from ``source`` (an ``AudioSource`` instance) to account for ambient noise.
513
514        Intended to calibrate the energy threshold with the ambient energy level. Should be used on periods of audio without speech - will stop early if any speech is detected.
515
516        The ``duration`` parameter is the maximum number of seconds that it will dynamically adjust the threshold for before returning. This value should be at least 0.5 in order to get a representative sample of the ambient noise.
517        """
518        assert isinstance(source, AudioSource), "Source must be an audio source"
519        assert source.stream is not None, "Audio source must be entered before adjusting, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
520        assert self.pause_threshold >= self.non_speaking_duration >= 0
521
522        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
523        elapsed_time = 0
524
525        # adjust energy threshold until a phrase starts
526        while True:
527            elapsed_time += seconds_per_buffer
528            if elapsed_time > duration: break
529            buffer = source.stream.read(source.CHUNK)
530            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
531
532            # dynamically adjust the energy threshold using asymmetric weighted average
533            damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer  # account for different chunk sizes and rates
534            target_energy = energy * self.dynamic_energy_ratio
535            self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
536
537    def snowboy_wait_for_hot_word(self, snowboy_location, snowboy_hot_word_files, source, timeout=None):
538        # load snowboy library (NOT THREAD SAFE)
539        sys.path.append(snowboy_location)
540        import snowboydetect
541        sys.path.pop()
542
543        detector = snowboydetect.SnowboyDetect(
544            resource_filename=os.path.join(snowboy_location, "resources", "common.res").encode(),
545            model_str=",".join(snowboy_hot_word_files).encode()
546        )
547        detector.SetAudioGain(1.0)
548        detector.SetSensitivity(",".join(["0.4"] * len(snowboy_hot_word_files)).encode())
549        snowboy_sample_rate = detector.SampleRate()
550
551        elapsed_time = 0
552        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
553        resampling_state = None
554
555        # buffers capable of holding 5 seconds of original and resampled audio
556        five_seconds_buffer_count = int(math.ceil(5 / seconds_per_buffer))
557        frames = collections.deque(maxlen=five_seconds_buffer_count)
558        resampled_frames = collections.deque(maxlen=five_seconds_buffer_count)
559        while True:
560            elapsed_time += seconds_per_buffer
561            if timeout and elapsed_time > timeout:
562                raise WaitTimeoutError("listening timed out while waiting for hotword to be said")
563
564            buffer = source.stream.read(source.CHUNK)
565            if len(buffer) == 0: break  # reached end of the stream
566            frames.append(buffer)
567
568            # resample audio to the required sample rate
569            resampled_buffer, resampling_state = audioop.ratecv(buffer, source.SAMPLE_WIDTH, 1, source.SAMPLE_RATE, snowboy_sample_rate, resampling_state)
570            resampled_frames.append(resampled_buffer)
571
572            # run Snowboy on the resampled audio
573            snowboy_result = detector.RunDetection(b"".join(resampled_frames))
574            assert snowboy_result != -1, "Error initializing streams or reading audio data"
575            if snowboy_result > 0: break  # wake word found
576
577        return b"".join(frames), elapsed_time
578
579    def listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None):
580        """
581        Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
582
583        This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.
584
585        The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout.
586
587        The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_timeout`` is ``None``, there will be no phrase time limit.
588
589        The ``snowboy_configuration`` parameter allows integration with `Snowboy <https://snowboy.kitt.ai/>`__, an offline, high-accuracy, power-efficient hotword recognition engine. When used, this function will pause until Snowboy detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Snowboy support, or a tuple of the form ``(SNOWBOY_LOCATION, LIST_OF_HOT_WORD_FILES)``, where ``SNOWBOY_LOCATION`` is the path to the Snowboy root directory, and ``LIST_OF_HOT_WORD_FILES`` is a list of paths to Snowboy hotword configuration files (`*.pmdl` or `*.umdl` format).
590
591        This operation will always complete within ``timeout + phrase_timeout`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception.
592        """
593        assert isinstance(source, AudioSource), "Source must be an audio source"
594        assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
595        assert self.pause_threshold >= self.non_speaking_duration >= 0
596        if snowboy_configuration is not None:
597            assert os.path.isfile(os.path.join(snowboy_configuration[0], "snowboydetect.py")), "``snowboy_configuration[0]`` must be a Snowboy root directory containing ``snowboydetect.py``"
598            for hot_word_file in snowboy_configuration[1]:
599                assert os.path.isfile(hot_word_file), "``snowboy_configuration[1]`` must be a list of Snowboy hot word configuration files"
600
601        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
602        pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer))  # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
603        phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer))  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
604        non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))  # maximum number of buffers of non-speaking audio to retain before and after a phrase
605
606        # read audio input for phrases until there is a phrase that is long enough
607        elapsed_time = 0  # number of seconds of audio read
608        buffer = b""  # an empty buffer means that the stream has ended and there is no data left to read
609        while True:
610            frames = collections.deque()
611
612            if snowboy_configuration is None:
613                # store audio input until the phrase starts
614                while True:
615                    # handle waiting too long for phrase by raising an exception
616                    elapsed_time += seconds_per_buffer
617                    if timeout and elapsed_time > timeout:
618                        raise WaitTimeoutError("listening timed out while waiting for phrase to start")
619
620                    buffer = source.stream.read(source.CHUNK)
621                    if len(buffer) == 0: break  # reached end of the stream
622                    frames.append(buffer)
623                    if len(frames) > non_speaking_buffer_count:  # ensure we only keep the needed amount of non-speaking buffers
624                        frames.popleft()
625
626                    # detect whether speaking has started on audio input
627                    energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
628                    if energy > self.energy_threshold: break
629
630                    # dynamically adjust the energy threshold using asymmetric weighted average
631                    if self.dynamic_energy_threshold:
632                        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer  # account for different chunk sizes and rates
633                        target_energy = energy * self.dynamic_energy_ratio
634                        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
635            else:
636                # read audio input until the hotword is said
637                snowboy_location, snowboy_hot_word_files = snowboy_configuration
638                buffer, delta_time = self.snowboy_wait_for_hot_word(snowboy_location, snowboy_hot_word_files, source, timeout)
639                elapsed_time += delta_time
640                if len(buffer) == 0: break  # reached end of the stream
641                frames.append(buffer)
642
643            # read audio input until the phrase ends
644            pause_count, phrase_count = 0, 0
645            phrase_start_time = elapsed_time
646            while True:
647                # handle phrase being too long by cutting off the audio
648                elapsed_time += seconds_per_buffer
649                if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit:
650                    break
651
652                buffer = source.stream.read(source.CHUNK)
653                if len(buffer) == 0: break  # reached end of the stream
654                frames.append(buffer)
655                phrase_count += 1
656
657                # check if speaking has stopped for longer than the pause threshold on the audio input
658                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # unit energy of the audio signal within the buffer
659                if energy > self.energy_threshold:
660                    pause_count = 0
661                else:
662                    pause_count += 1
663                if pause_count > pause_buffer_count:  # end of the phrase
664                    break
665
666            # check how long the detected phrase is, and retry listening if the phrase is too short
667            phrase_count -= pause_count  # exclude the buffers for the pause before the phrase
668            if phrase_count >= phrase_buffer_count or len(buffer) == 0: break  # phrase is long enough or we've reached the end of the stream, so stop listening
669
670        # obtain frame data
671        for i in range(pause_count - non_speaking_buffer_count): frames.pop()  # remove extra non-speaking frames at the end
672        frame_data = b"".join(frames)
673
674        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
675
676    def listen_in_background(self, source, callback, phrase_time_limit=None):
677        """
678        Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance and call ``callback`` with that ``AudioData`` instance as soon as each phrase are detected.
679
680        Returns a function object that, when called, requests that the background listener thread stop. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads. The function accepts one parameter, ``wait_for_stop``: if truthy, the function will wait for the background listener to stop before returning, otherwise it will return immediately and the background listener thread might still be running for a second or two afterwards. Additionally, if you are using a truthy value for ``wait_for_stop``, you must call the function from the same thread you originally called ``listen_in_background`` from.
681
682        Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. The ``phrase_time_limit`` parameter works in the same way as the ``phrase_time_limit`` parameter for ``recognizer_instance.listen(source)``, as well.
683
684        The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that ``callback`` function will be called from a non-main thread.
685        """
686        assert isinstance(source, AudioSource), "Source must be an audio source"
687        running = [True]
688
689        def threaded_listen():
690            with source as s:
691                while running[0]:
692                    try:  # listen for 1 second, then check again if the stop function has been called
693                        audio = self.listen(s, 1, phrase_time_limit)
694                    except WaitTimeoutError:  # listening timed out, just try again
695                        pass
696                    else:
697                        if running[0]: callback(self, audio)
698
699        def stopper(wait_for_stop=True):
700            running[0] = False
701            if wait_for_stop:
702                listener_thread.join()  # block until the background thread is done, which can take around 1 second
703
704        listener_thread = threading.Thread(target=threaded_listen)
705        listener_thread.daemon = True
706        listener_thread.start()
707        return stopper
708
709    def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
710        """
711        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
712
713        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using `PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.
714
715        If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.
716
717        Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, content of ``grammar`` will be ignored.
718
719        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.
720
721        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
722        """
723        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
724        assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
725        assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"
726
727        # import the PocketSphinx speech recognition module
728        try:
729            from pocketsphinx import pocketsphinx, Jsgf, FsgModel
730
731        except ImportError:
732            raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
733        except ValueError:
734            raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
735        if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"):
736            raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")
737
738        if isinstance(language, str):  # directory containing language data
739            language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
740            if not os.path.isdir(language_directory):
741                raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
742            acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
743            language_model_file = os.path.join(language_directory, "language-model.lm.bin")
744            phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
745        else:  # 3-tuple of Sphinx data file paths
746            acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
747        if not os.path.isdir(acoustic_parameters_directory):
748            raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
749        if not os.path.isfile(language_model_file):
750            raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
751        if not os.path.isfile(phoneme_dictionary_file):
752            raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))
753
754        # create decoder object
755        config = pocketsphinx.Decoder.default_config()
756        config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
757        config.set_string("-lm", language_model_file)
758        config.set_string("-dict", phoneme_dictionary_file)
759        config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
760        decoder = pocketsphinx.Decoder(config)
761
762        # obtain audio data
763        raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
764
765        # obtain recognition results
766        if keyword_entries is not None:  # explicitly specified set of keywords
767            with PortableNamedTemporaryFile("w") as f:
768                # generate a keywords file - Sphinx documentation recommendeds sensitivities between 1e-50 and 1e-5
769                f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
770                f.flush()
771
772                # perform the speech recognition with the keywords file (this is inside the context manager so the file isn;t deleted until we're done)
773                decoder.set_kws("keywords", f.name)
774                decoder.set_search("keywords")
775                decoder.start_utt()  # begin utterance processing
776                decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
777                decoder.end_utt()  # stop utterance processing
778        elif grammar is not None:  # a path to a FSG or JSGF grammar
779            if not os.path.exists(grammar):
780                raise ValueError("Grammar '{0}' does not exist.".format(grammar))
781            grammar_path = os.path.abspath(os.path.dirname(grammar))
782            grammar_name = os.path.splitext(os.path.basename(grammar))[0]
783            fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
784            if not os.path.exists(fsg_path):  # create FSG grammar if not available
785                jsgf = Jsgf(grammar)
786                rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
787                fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
788                fsg.writefile(fsg_path)
789            else:
790                fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
791            decoder.set_fsg(grammar_name, fsg)
792            decoder.set_search(grammar_name)
793            decoder.start_utt()
794            decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
795            decoder.end_utt()  # stop utterance processing
796        else:  # no keywords, perform freeform recognition
797            decoder.start_utt()  # begin utterance processing
798            decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
799            decoder.end_utt()  # stop utterance processing
800
801        if show_all: return decoder
802
803        # return results
804        hypothesis = decoder.hyp()
805        if hypothesis is not None: return hypothesis.hypstr
806        raise UnknownValueError()  # no transcriptions available
807
808    def recognize_google(self, audio_data, key=None, language="en-US", show_all=False):
809        """
810        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.
811
812        The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
813
814        To obtain your own API key, simply following the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
815
816        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.
817
818        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
819
820        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
821        """
822        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
823        assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string"
824        assert isinstance(language, str), "``language`` must be a string"
825
826        flac_data = audio_data.get_flac_data(
827            convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples must be at least 8 kHz
828            convert_width=2  # audio samples must be 16-bit
829        )
830        if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
831        url = "http://www.google.com/speech-api/v2/recognize?{}".format(urlencode({
832            "client": "chromium",
833            "lang": language,
834            "key": key,
835        }))
836        request = Request(url, data=flac_data, headers={"Content-Type": "audio/x-flac; rate={}".format(audio_data.sample_rate)})
837
838        # obtain audio transcription results
839        try:
840            response = urlopen(request, timeout=self.operation_timeout)
841        except HTTPError as e:
842            raise RequestError("recognition request failed: {}".format(e.reason))
843        except URLError as e:
844            raise RequestError("recognition connection failed: {}".format(e.reason))
845        response_text = response.read().decode("utf-8")
846
847        # ignore any blank blocks
848        actual_result = []
849        for line in response_text.split("\n"):
850            if not line: continue
851            result = json.loads(line)["result"]
852            if len(result) != 0:
853                actual_result = result[0]
854                break
855
856        # return results
857        if show_all: return actual_result
858        if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0: raise UnknownValueError()
859
860        if "confidence" in actual_result["alternative"]:
861            # return alternative with highest confidence score
862            best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
863        else:
864            # when there is no confidence available, we arbitrarily choose the first hypothesis.
865            best_hypothesis = actual_result["alternative"][0]
866        if "transcript" not in best_hypothesis: raise UnknownValueError()
867        return best_hypothesis["transcript"]
868
869    def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False):
870        """
871        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API.
872
873        This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart <https://cloud.google.com/speech/docs/getting-started>`__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
874
875        The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation <https://cloud.google.com/speech/docs/languages>`__.
876
877        If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
878
879        Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
880
881        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
882        """
883        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
884        if credentials_json is not None:
885            try: json.loads(credentials_json)
886            except Exception: raise AssertionError("``credentials_json`` must be ``None`` or a valid JSON string")
887        assert isinstance(language, str), "``language`` must be a string"
888        assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings"
889
890        # See https://cloud.google.com/speech/reference/rest/v1/RecognitionConfig
891        flac_data = audio_data.get_flac_data(
892            convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)),  # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
893            convert_width=2  # audio samples must be 16-bit
894        )
895
896        try:
897            from oauth2client.client import GoogleCredentials
898            from googleapiclient.discovery import build
899            import googleapiclient.errors
900
901            # cannot simply use 'http = httplib2.Http(timeout=self.operation_timeout)'
902            # because discovery.build() says 'Arguments http and credentials are mutually exclusive'
903            import socket
904            import googleapiclient.http
905            if self.operation_timeout and socket.getdefaulttimeout() is None:
906                # override constant (used by googleapiclient.http.build_http())
907                googleapiclient.http.DEFAULT_HTTP_TIMEOUT_SEC = self.operation_timeout
908
909            if credentials_json is None:
910                api_credentials = GoogleCredentials.get_application_default()
911            else:
912                # the credentials can only be read from a file, so we'll make a temp file and write in the contents to work around that
913                with PortableNamedTemporaryFile("w") as f:
914                    f.write(credentials_json)
915                    f.flush()
916                    api_credentials = GoogleCredentials.from_stream(f.name)
917
918            speech_service = build("speech", "v1", credentials=api_credentials, cache_discovery=False)
919        except ImportError:
920            raise RequestError("missing google-api-python-client module: ensure that google-api-python-client is set up correctly.")
921
922        speech_config = {"encoding": "FLAC", "sampleRateHertz": audio_data.sample_rate, "languageCode": language}
923        if preferred_phrases is not None:
924            speech_config["speechContext"] = {"phrases": preferred_phrases}
925        if show_all:
926            speech_config["enableWordTimeOffsets"] = True  # some useful extra options for when we want all the output
927        request = speech_service.speech().recognize(body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config})
928
929        try:
930            response = request.execute()
931        except googleapiclient.errors.HttpError as e:
932            raise RequestError(e)
933        except URLError as e:
934            raise RequestError("recognition connection failed: {0}".format(e.reason))
935
936        if show_all: return response
937        if "results" not in response or len(response["results"]) == 0: raise UnknownValueError()
938        transcript = ""
939        for result in response["results"]:
940            transcript += result["alternatives"][0]["transcript"].strip() + " "
941
942        return transcript
943
944    def recognize_wit(self, audio_data, key, show_all=False):
945        """
946        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API.
947
948        The Wit.ai API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://wit.ai/>`__ and creating an app. You will need to add at least one intent to the app before you can see the API key, though the actual intent settings don't matter.
949
950        To get the API key for a Wit.ai app, go to the app's overview page, go to the section titled "Make an API request", and look for something along the lines of ``Authorization: Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX``; ``XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX`` is the API key. Wit.ai API keys are 32-character uppercase alphanumeric strings.
951
952        The recognition language is configured in the Wit.ai app settings.
953
954        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://wit.ai/docs/http/20141022#get-intent-via-text-link>`__ as a JSON dictionary.
955
956        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
957        """
958        assert isinstance(audio_data, AudioData), "Data must be audio data"
959        assert isinstance(key, str), "``key`` must be a string"
960
961        wav_data = audio_data.get_wav_data(
962            convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples must be at least 8 kHz
963            convert_width=2  # audio samples should be 16-bit
964        )
965        url = "https://api.wit.ai/speech?v=20160526"
966        request = Request(url, data=wav_data, headers={"Authorization": "Bearer {}".format(key), "Content-Type": "audio/wav"})
967        try:
968            response = urlopen(request, timeout=self.operation_timeout)
969        except HTTPError as e:
970            raise RequestError("recognition request failed: {}".format(e.reason))
971        except URLError as e:
972            raise RequestError("recognition connection failed: {}".format(e.reason))
973        response_text = response.read().decode("utf-8")
974        result = json.loads(response_text)
975
976        # return results
977        if show_all: return result
978        if "_text" not in result or result["_text"] is None: raise UnknownValueError()
979        return result["_text"]
980
981    def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
982        """
983        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.
984
985        The Microsoft Bing Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.
986
987        To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API > "Create", and fill in the form to make a "Bing Speech API" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings.
988
989        The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".
990
991        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
992
993        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
994        """
995        assert isinstance(audio_data, AudioData), "Data must be audio data"
996        assert isinstance(key, str), "``key`` must be a string"
997        assert isinstance(language, str), "``language`` must be a string"
998
999        access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None)
1000        allow_caching = True
1001        try:
1002            from time import monotonic  # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
1003        except ImportError:
1004            try:
1005                from monotonic import monotonic  # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic)
1006            except (ImportError, RuntimeError):
1007                expire_time = None  # monotonic time not available, don't cache access tokens
1008                allow_caching = False  # don't allow caching, since monotonic time isn't available
1009        if expire_time is None or monotonic() > expire_time:  # caching not enabled, first credential request, or the access token from the previous one expired
1010            # get an access token using OAuth
1011            credential_url = "https://api.cognitive.microsoft.com/sts/v1.0/issueToken"
1012            credential_request = Request(credential_url, data=b"", headers={
1013                "Content-type": "application/x-www-form-urlencoded",
1014                "Content-Length": "0",
1015                "Ocp-Apim-Subscription-Key": key,
1016            })
1017
1018            if allow_caching:
1019                start_time = monotonic()
1020
1021            try:
1022                credential_response = urlopen(credential_request, timeout=60)  # credential response can take longer, use longer timeout instead of default one
1023            except HTTPError as e:
1024                raise RequestError("credential request failed: {}".format(e.reason))
1025            except URLError as e:
1026                raise RequestError("credential connection failed: {}".format(e.reason))
1027            access_token = credential_response.read().decode("utf-8")
1028
1029            if allow_caching:
1030                # save the token for the duration it is valid for
1031                self.bing_cached_access_token = access_token
1032                self.bing_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition, the token expires in exactly 10 minutes
1033
1034        wav_data = audio_data.get_wav_data(
1035            convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
1036            convert_width=2  # audio samples should be 16-bit
1037        )
1038
1039        url = "https://speech.platform.bing.com/speech/recognition/interactive/cognitiveservices/v1?{}".format(urlencode({
1040            "language": language,
1041            "locale": language,
1042            "requestid": uuid.uuid4(),
1043        }))
1044
1045        if sys.version_info >= (3, 6):  # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible
1046            request = Request(url, data=io.BytesIO(wav_data), headers={
1047                "Authorization": "Bearer {}".format(access_token),
1048                "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
1049                "Transfer-Encoding": "chunked",
1050            })
1051        else:  # fall back on manually formatting the POST body as a chunked request
1052            ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8")
1053            chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n"
1054            request = Request(url, data=chunked_transfer_encoding_data, headers={
1055                "Authorization": "Bearer {}".format(access_token),
1056                "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
1057                "Transfer-Encoding": "chunked",
1058            })
1059
1060        try:
1061            response = urlopen(request, timeout=self.operation_timeout)
1062        except HTTPError as e:
1063            raise RequestError("recognition request failed: {}".format(e.reason))
1064        except URLError as e:
1065            raise RequestError("recognition connection failed: {}".format(e.reason))
1066        response_text = response.read().decode("utf-8")
1067        result = json.loads(response_text)
1068
1069        # return results
1070        if show_all: return result
1071        if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError()
1072        return result["DisplayText"]
1073
1074    def recognize_houndify(self, audio_data, client_id, client_key, show_all=False):
1075        """
1076        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Houndify API.
1077
1078        The Houndify client ID and client key are specified by ``client_id`` and ``client_key``, respectively. Unfortunately, these are not available without `signing up for an account <https://www.houndify.com/signup>`__. Once logged into the `dashboard <https://www.houndify.com/dashboard>`__, you will want to select "Register a new client", and fill in the form as necessary. When at the "Enable Domains" page, enable the "Speech To Text Only" domain, and then select "Save & Continue".
1079
1080        To get the client ID and client key for a Houndify client, go to the `dashboard <https://www.houndify.com/dashboard>`__ and select the client's "View Details" link. On the resulting page, the client ID and client key will be visible. Client IDs and client keys are both Base64-encoded strings.
1081
1082        Currently, only English is supported as a recognition language.
1083
1084        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
1085
1086        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
1087        """
1088        assert isinstance(audio_data, AudioData), "Data must be audio data"
1089        assert isinstance(client_id, str), "``client_id`` must be a string"
1090        assert isinstance(client_key, str), "``client_key`` must be a string"
1091
1092        wav_data = audio_data.get_wav_data(
1093            convert_rate=None if audio_data.sample_rate in [8000, 16000] else 16000,  # audio samples must be 8 kHz or 16 kHz
1094            convert_width=2  # audio samples should be 16-bit
1095        )
1096        url = "https://api.houndify.com/v1/audio"
1097        user_id, request_id = str(uuid.uuid4()), str(uuid.uuid4())
1098        request_time = str(int(time.time()))
1099        request_signature = base64.urlsafe_b64encode(
1100            hmac.new(
1101                base64.urlsafe_b64decode(client_key),
1102                user_id.encode("utf-8") + b";" + request_id.encode("utf-8") + request_time.encode("utf-8"),
1103                hashlib.sha256
1104            ).digest()  # get the HMAC digest as bytes
1105        ).decode("utf-8")
1106        request = Request(url, data=wav_data, headers={
1107            "Content-Type": "application/json",
1108            "Hound-Request-Info": json.dumps({"ClientID": client_id, "UserID": user_id}),
1109            "Hound-Request-Authentication": "{};{}".format(user_id, request_id),
1110            "Hound-Client-Authentication": "{};{};{}".format(client_id, request_time, request_signature)
1111        })
1112        try:
1113            response = urlopen(request, timeout=self.operation_timeout)
1114        except HTTPError as e:
1115            raise RequestError("recognition request failed: {}".format(e.reason))
1116        except URLError as e:
1117            raise RequestError("recognition connection failed: {}".format(e.reason))
1118        response_text = response.read().decode("utf-8")
1119        result = json.loads(response_text)
1120
1121        # return results
1122        if show_all: return result
1123        if "Disambiguation" not in result or result["Disambiguation"] is None:
1124            raise UnknownValueError()
1125        return result['Disambiguation']['ChoiceData'][0]['Transcription']
1126
1127    def recognize_ibm(self, audio_data, username, password, language="en-US", show_all=False):
1128        """
1129        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the IBM Speech to Text API.
1130
1131        The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without `signing up for an account <https://console.ng.bluemix.net/registration/>`__. Once logged into the Bluemix console, follow the instructions for `creating an IBM Watson service instance <https://www.ibm.com/watson/developercloud/doc/getting_started/gs-credentials.shtml>`__, where the Watson service is "Speech To Text". IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, while passwords are mixed-case alphanumeric strings.
1132
1133        The recognition language is determined by ``language``, an RFC5646 language tag with a dialect like ``"en-US"`` (US English) or ``"zh-CN"`` (Mandarin Chinese), defaulting to US English. The supported language values are listed under the ``model`` parameter of the `audio recognition API documentation <https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/#sessionless_methods>`__, in the form ``LANGUAGE_BroadbandModel``, where ``LANGUAGE`` is the language value.
1134
1135        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/#sessionless_methods>`__ as a JSON dictionary.
1136
1137        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
1138        """
1139        assert isinstance(audio_data, AudioData), "Data must be audio data"
1140        assert isinstance(username, str), "``username`` must be a string"
1141        assert isinstance(password, str), "``password`` must be a string"
1142
1143        flac_data = audio_data.get_flac_data(
1144            convert_rate=None if audio_data.sample_rate >= 16000 else 16000,  # audio samples should be at least 16 kHz
1145            convert_width=None if audio_data.sample_width >= 2 else 2  # audio samples should be at least 16-bit
1146        )
1147        url = "https://stream.watsonplatform.net/speech-to-text/api/v1/recognize?{}".format(urlencode({
1148            "profanity_filter": "false",
1149            "model": "{}_BroadbandModel".format(language),
1150            "inactivity_timeout": -1,  # don't stop recognizing when the audio stream activity stops
1151        }))
1152        request = Request(url, data=flac_data, headers={
1153            "Content-Type": "audio/x-flac",
1154            "X-Watson-Learning-Opt-Out": "true",  # prevent requests from being logged, for improved privacy
1155        })
1156        authorization_value = base64.standard_b64encode("{}:{}".format(username, password).encode("utf-8")).decode("utf-8")
1157        request.add_header("Authorization", "Basic {}".format(authorization_value))
1158        try:
1159            response = urlopen(request, timeout=self.operation_timeout)
1160        except HTTPError as e:
1161            raise RequestError("recognition request failed: {}".format(e.reason))
1162        except URLError as e:
1163            raise RequestError("recognition connection failed: {}".format(e.reason))
1164        response_text = response.read().decode("utf-8")
1165        result = json.loads(response_text)
1166
1167        # return results
1168        if show_all: return result
1169        if "results" not in result or len(result["results"]) < 1 or "alternatives" not in result["results"][0]:
1170            raise UnknownValueError()
1171
1172        transcription = []
1173        for utterance in result["results"]:
1174            if "alternatives" not in utterance: raise UnknownValueError()
1175            for hypothesis in utterance["alternatives"]:
1176                if "transcript" in hypothesis:
1177                    transcription.append(hypothesis["transcript"])
1178        return "\n".join(transcription)
1179
1180
1181def get_flac_converter():
1182    """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
1183    flac_converter = shutil_which("flac")  # check for installed version first
1184    if flac_converter is None:  # flac utility is not installed
1185        base_path = os.path.dirname(os.path.abspath(__file__))  # directory of the current module file, where all the FLAC bundled binaries are stored
1186        system, machine = platform.system(), platform.machine()
1187        if system == "Windows" and machine in {"i686", "i786", "x86", "x86_64", "AMD64"}:
1188            flac_converter = os.path.join(base_path, "flac-win32.exe")
1189        elif system == "Darwin" and machine in {"i686", "i786", "x86", "x86_64", "AMD64"}:
1190            flac_converter = os.path.join(base_path, "flac-mac")
1191        elif system == "Linux" and machine in {"i686", "i786", "x86"}:
1192            flac_converter = os.path.join(base_path, "flac-linux-x86")
1193        elif system == "Linux" and machine in {"x86_64", "AMD64"}:
1194            flac_converter = os.path.join(base_path, "flac-linux-x86_64")
1195        else:  # no FLAC converter available
1196            raise OSError("FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent")
1197
1198    # mark FLAC converter as executable if possible
1199    try:
1200        # handle known issue when running on docker:
1201        # run executable right after chmod() may result in OSError "Text file busy"
1202        # fix: flush FS with sync
1203        if not os.access(flac_converter, os.X_OK):
1204            stat_info = os.stat(flac_converter)
1205            os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC)
1206            if 'Linux' in platform.system():
1207                os.sync() if sys.version_info >= (3, 3) else os.system('sync')
1208
1209    except OSError: pass
1210
1211    return flac_converter
1212
1213
1214def shutil_which(pgm):
1215    """Python 2 compatibility: backport of ``shutil.which()`` from Python 3"""
1216    path = os.getenv('PATH')
1217    for p in path.split(os.path.pathsep):
1218        p = os.path.join(p, pgm)
1219        if os.path.exists(p) and os.access(p, os.X_OK):
1220            return p
1221
1222
1223class PortableNamedTemporaryFile(object):
1224    """Limited replacement for ``tempfile.NamedTemporaryFile``, except unlike ``tempfile.NamedTemporaryFile``, the file can be opened again while it's currently open, even on Windows."""
1225    def __init__(self, mode="w+b"):
1226        self.mode = mode
1227
1228    def __enter__(self):
1229        # create the temporary file and open it
1230        import tempfile
1231        file_descriptor, file_path = tempfile.mkstemp()
1232        self._file = os.fdopen(file_descriptor, self.mode)
1233
1234        # the name property is a public field
1235        self.name = file_path
1236        return self
1237
1238    def __exit__(self, exc_type, exc_value, traceback):
1239        self._file.close()
1240        os.remove(self.name)
1241
1242    def write(self, *args, **kwargs):
1243        return self._file.write(*args, **kwargs)
1244
1245    def writelines(self, *args, **kwargs):
1246        return self._file.writelines(*args, **kwargs)
1247
1248    def flush(self, *args, **kwargs):
1249        return self._file.flush(*args, **kwargs)
1250
1251
1252# ===============================
1253#  backwards compatibility shims
1254# ===============================
1255
1256WavFile = AudioFile  # WavFile was renamed to AudioFile in 3.4.1
1257
1258
1259def recognize_api(self, audio_data, client_access_token, language="en", session_id=None, show_all=False):
1260    wav_data = audio_data.get_wav_data(convert_rate=16000, convert_width=2)
1261    url = "https://api.api.ai/v1/query"
1262    while True:
1263        boundary = uuid.uuid4().hex
1264        if boundary.encode("utf-8") not in wav_data: break
1265    if session_id is None: session_id = uuid.uuid4().hex
1266    data = b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"request\"\r\n" + b"Content-Type: application/json\r\n" + b"\r\n" + b"{\"v\": \"20150910\", \"sessionId\": \"" + session_id.encode("utf-8") + b"\", \"lang\": \"" + language.encode("utf-8") + b"\"}\r\n" + b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"voiceData\"; filename=\"audio.wav\"\r\n" + b"Content-Type: audio/wav\r\n" + b"\r\n" + wav_data + b"\r\n" + b"--" + boundary.encode("utf-8") + b"--\r\n"
1267    request = Request(url, data=data, headers={"Authorization": "Bearer {}".format(client_access_token), "Content-Length": str(len(data)), "Expect": "100-continue", "Content-Type": "multipart/form-data; boundary={}".format(boundary)})
1268    try: response = urlopen(request, timeout=10)
1269    except HTTPError as e: raise RequestError("recognition request failed: {}".format(e.reason))
1270    except URLError as e: raise RequestError("recognition connection failed: {}".format(e.reason))
1271    response_text = response.read().decode("utf-8")
1272    result = json.loads(response_text)
1273    if show_all: return result
1274    if "status" not in result or "errorType" not in result["status"] or result["status"]["errorType"] != "success":
1275        raise UnknownValueError()
1276    return result["result"]["resolvedQuery"]
1277
1278
1279Recognizer.recognize_api = classmethod(recognize_api)  # API.AI Speech Recognition is deprecated/not recommended as of 3.5.0, and currently is only optionally available for paid plans
1280