1#!/usr/bin/env python3 2 3"""Library for performing speech recognition, with support for several engines and APIs, online and offline.""" 4 5import io 6import os 7import sys 8import subprocess 9import wave 10import aifc 11import math 12import audioop 13import collections 14import json 15import base64 16import threading 17import platform 18import stat 19import hashlib 20import hmac 21import time 22import uuid 23 24__author__ = "Anthony Zhang (Uberi)" 25__version__ = "3.8.1" 26__license__ = "BSD" 27 28try: # attempt to use the Python 2 modules 29 from urllib import urlencode 30 from urllib2 import Request, urlopen, URLError, HTTPError 31except ImportError: # use the Python 3 modules 32 from urllib.parse import urlencode 33 from urllib.request import Request, urlopen 34 from urllib.error import URLError, HTTPError 35 36 37class WaitTimeoutError(Exception): pass 38 39 40class RequestError(Exception): pass 41 42 43class UnknownValueError(Exception): pass 44 45 46class AudioSource(object): 47 def __init__(self): 48 raise NotImplementedError("this is an abstract class") 49 50 def __enter__(self): 51 raise NotImplementedError("this is an abstract class") 52 53 def __exit__(self, exc_type, exc_value, traceback): 54 raise NotImplementedError("this is an abstract class") 55 56 57class Microphone(AudioSource): 58 """ 59 Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``. 60 61 This will throw an ``AttributeError`` if you don't have PyAudio 0.2.11 or later installed. 62 63 If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input. 64 65 A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. 
class Microphone(AudioSource):
    """
    Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``.

    This will throw an ``AttributeError`` if you don't have PyAudio 0.2.11 or later installed.

    If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input.

    A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation <http://people.csail.mit.edu/hubert/pyaudio/docs/>`__ for more details.

    The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). If not specified, the value of ``sample_rate`` is determined automatically from the system's microphone settings.

    Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some CPUs, such as those in older Raspberry Pi models, can't keep up if this value is too high.

    Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default.
    """
    def __init__(self, device_index=None, sample_rate=None, chunk_size=1024):
        assert device_index is None or isinstance(device_index, int), "Device index must be None or an integer"
        assert sample_rate is None or (isinstance(sample_rate, int) and sample_rate > 0), "Sample rate must be None or a positive integer"
        assert isinstance(chunk_size, int) and chunk_size > 0, "Chunk size must be a positive integer"

        # set up PyAudio
        self.pyaudio_module = self.get_pyaudio()
        audio = self.pyaudio_module.PyAudio()
        try:
            count = audio.get_device_count()  # obtain device count
            if device_index is not None:  # ensure device index is in range
                assert 0 <= device_index < count, "Device index out of range ({} devices available; device index should be between 0 and {} inclusive)".format(count, count - 1)
            if sample_rate is None:  # automatically set the sample rate to the hardware's default sample rate if not specified
                device_info = audio.get_device_info_by_index(device_index) if device_index is not None else audio.get_default_input_device_info()
                assert isinstance(device_info.get("defaultSampleRate"), (float, int)) and device_info["defaultSampleRate"] > 0, "Invalid device info returned from PyAudio: {}".format(device_info)
                sample_rate = int(device_info["defaultSampleRate"])
        except Exception:
            # make sure the temporary PyAudio instance is released even if device validation fails
            audio.terminate()
            raise

        self.device_index = device_index
        self.format = self.pyaudio_module.paInt16  # 16-bit int sampling
        self.SAMPLE_WIDTH = self.pyaudio_module.get_sample_size(self.format)  # size of each sample
        self.SAMPLE_RATE = sample_rate  # sampling rate in Hertz
        self.CHUNK = chunk_size  # number of frames stored in each buffer

        # set lazily: the actual PyAudio instance and stream are only created on __enter__
        self.audio = None
        self.stream = None

    @staticmethod
    def get_pyaudio():
        """
        Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed
        """
        try:
            import pyaudio
        except ImportError:
            raise AttributeError("Could not find PyAudio; check installation")
        # NOTE(review): distutils is deprecated in newer Python versions — confirm before upgrading the version check
        from distutils.version import LooseVersion
        if LooseVersion(pyaudio.__version__) < LooseVersion("0.2.11"):
            raise AttributeError("PyAudio 0.2.11 or later is required (found version {})".format(pyaudio.__version__))
        return pyaudio

    @staticmethod
    def list_microphone_names():
        """
        Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead.

        The index of each microphone's name is the same as its device index when creating a ``Microphone`` instance - indices in this list can be used as values of ``device_index``.
        """
        audio = Microphone.get_pyaudio().PyAudio()
        try:
            result = []
            for i in range(audio.get_device_count()):
                device_info = audio.get_device_info_by_index(i)
                result.append(device_info.get("name"))
        finally:
            audio.terminate()
        return result

    def __enter__(self):
        assert self.stream is None, "This audio source is already inside a context manager"
        self.audio = self.pyaudio_module.PyAudio()
        try:
            self.stream = Microphone.MicrophoneStream(
                self.audio.open(
                    input_device_index=self.device_index, channels=1,
                    format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK,
                    input=True,  # stream is an input stream
                )
            )
        except Exception:
            # opening the stream failed; release the PyAudio instance before propagating
            self.audio.terminate()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self.stream.close()
        finally:
            # terminate PyAudio even if closing the stream raised
            self.stream = None
            self.audio.terminate()

    class MicrophoneStream(object):
        # thin wrapper around a PyAudio stream that normalizes ``read`` and ``close`` behavior
        def __init__(self, pyaudio_stream):
            self.pyaudio_stream = pyaudio_stream

        def read(self, size):
            # don't raise on input overflow; dropped frames are preferable to a crash while listening
            return self.pyaudio_stream.read(size, exception_on_overflow=False)

        def close(self):
            try:
                # sometimes, if the stream isn't stopped, closing the stream throws an exception
                if not self.pyaudio_stream.is_stopped():
                    self.pyaudio_stream.stop_stream()
            finally:
                self.pyaudio_stream.close()
class AudioFile(AudioSource):
    """
    Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. Subclass of ``AudioSource``.

    If ``filename_or_fileobject`` is a string, then it is interpreted as a path to an audio file on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar.

    Note that functions that read from the audio (such as ``recognizer_instance.record`` or ``recognizer_instance.listen``) will move ahead in the stream. For example, if you execute ``recognizer_instance.record(audiofile_instance, duration=10)`` twice, the first time it will return the first 10 seconds of audio, and the second time it will return the 10 seconds of audio right after that. This is always reset to the beginning when entering an ``AudioFile`` context.

    WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour.

    Both AIFF and AIFF-C (compressed AIFF) formats are supported.

    FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour.
    """

    def __init__(self, filename_or_fileobject):
        assert isinstance(filename_or_fileobject, (type(""), type(u""))) or hasattr(filename_or_fileobject, "read"), "Given audio file must be a filename string or a file-like object"
        self.filename_or_fileobject = filename_or_fileobject
        self.stream = None       # AudioFileStream while inside a context manager, None otherwise
        self.DURATION = None     # length of the audio in seconds, known only after __enter__

        self.audio_reader = None    # underlying wave/aifc reader, set in __enter__
        self.little_endian = False  # byte order of the frames (True for WAV, False for AIFF)
        self.SAMPLE_RATE = None
        self.CHUNK = None
        self.FRAME_COUNT = None

    def __enter__(self):
        assert self.stream is None, "This audio source is already inside a context manager"
        try:
            # attempt to read the file as WAV
            self.audio_reader = wave.open(self.filename_or_fileobject, "rb")
            self.little_endian = True  # RIFF WAV is a little-endian format (most ``audioop`` operations assume that the frames are stored in little-endian form)
        except (wave.Error, EOFError):
            try:
                # attempt to read the file as AIFF
                self.audio_reader = aifc.open(self.filename_or_fileobject, "rb")
                self.little_endian = False  # AIFF is a big-endian format
            except (aifc.Error, EOFError):
                # attempt to read the file as FLAC
                if hasattr(self.filename_or_fileobject, "read"):
                    flac_data = self.filename_or_fileobject.read()
                else:
                    with open(self.filename_or_fileobject, "rb") as f: flac_data = f.read()

                # run the FLAC converter with the FLAC data to get the AIFF data
                flac_converter = get_flac_converter()
                if os.name == "nt":  # on Windows, specify that the process is to be started without showing a console window
                    startup_info = subprocess.STARTUPINFO()
                    startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW  # specify that the wShowWindow field of `startup_info` contains a value
                    startup_info.wShowWindow = subprocess.SW_HIDE  # specify that the console window should be hidden
                else:
                    startup_info = None  # default startupinfo
                process = subprocess.Popen([
                    flac_converter,
                    "--stdout", "--totally-silent",  # put the resulting AIFF file in stdout, and make sure it's not mixed with any program output
                    "--decode", "--force-aiff-format",  # decode the FLAC file into an AIFF file
                    "-",  # the input FLAC file contents will be given in stdin
                ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info)
                aiff_data, _ = process.communicate(flac_data)
                aiff_file = io.BytesIO(aiff_data)
                try:
                    self.audio_reader = aifc.open(aiff_file, "rb")
                except (aifc.Error, EOFError):
                    raise ValueError("Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format")
                self.little_endian = False  # AIFF is a big-endian format
        assert 1 <= self.audio_reader.getnchannels() <= 2, "Audio must be mono or stereo"
        self.SAMPLE_WIDTH = self.audio_reader.getsampwidth()

        # 24-bit audio needs some special handling for old Python versions (workaround for https://bugs.python.org/issue12866)
        samples_24_bit_pretending_to_be_32_bit = False
        if self.SAMPLE_WIDTH == 3:  # 24-bit audio
            try: audioop.bias(b"", self.SAMPLE_WIDTH, 0)  # test whether this sample width is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do)
            except audioop.error:  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
                samples_24_bit_pretending_to_be_32_bit = True  # while the ``AudioFile`` instance will outwardly appear to be 32-bit, it will actually internally be 24-bit
                self.SAMPLE_WIDTH = 4  # the ``AudioFile`` instance should present itself as a 32-bit stream now, since we'll be converting into 32-bit on the fly when reading

        self.SAMPLE_RATE = self.audio_reader.getframerate()
        self.CHUNK = 4096
        self.FRAME_COUNT = self.audio_reader.getnframes()
        self.DURATION = self.FRAME_COUNT / float(self.SAMPLE_RATE)
        self.stream = AudioFile.AudioFileStream(self.audio_reader, self.little_endian, samples_24_bit_pretending_to_be_32_bit)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if not hasattr(self.filename_or_fileobject, "read"):  # only close the file if it was opened by this class in the first place (if the file was originally given as a path)
            self.audio_reader.close()
        self.stream = None
        self.DURATION = None

    class AudioFileStream(object):
        def __init__(self, audio_reader, little_endian, samples_24_bit_pretending_to_be_32_bit):
            self.audio_reader = audio_reader  # an audio file object (e.g., a `wave.Wave_read` instance)
            self.little_endian = little_endian  # whether the audio data is little-endian (when working with big-endian things, we'll have to convert it to little-endian before we process it)
            self.samples_24_bit_pretending_to_be_32_bit = samples_24_bit_pretending_to_be_32_bit  # this is true if the audio is 24-bit audio, but 24-bit audio isn't supported, so we have to pretend that this is 32-bit audio and convert it on the fly

        def read(self, size=-1):
            # ``size=-1`` reads all remaining frames in one go
            buffer = self.audio_reader.readframes(self.audio_reader.getnframes() if size == -1 else size)
            if not isinstance(buffer, bytes): buffer = b""  # workaround for https://bugs.python.org/issue24608

            sample_width = self.audio_reader.getsampwidth()
            if not self.little_endian:  # big endian format, convert to little endian on the fly
                if hasattr(audioop, "byteswap"):  # ``audioop.byteswap`` was only added in Python 3.4 (incidentally, that also means that we don't need to worry about 24-bit audio being unsupported, since Python 3.4+ always has that functionality)
                    buffer = audioop.byteswap(buffer, sample_width)
                else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
                    buffer = buffer[sample_width - 1::-1] + b"".join(buffer[i + sample_width:i:-1] for i in range(sample_width - 1, len(buffer), sample_width))

            # workaround for https://bugs.python.org/issue12866
            if self.samples_24_bit_pretending_to_be_32_bit:  # we need to convert samples from 24-bit to 32-bit before we can process them with ``audioop`` functions
                buffer = b"".join(b"\x00" + buffer[i:i + sample_width] for i in range(0, len(buffer), sample_width))  # since we're in little endian, we prepend a zero byte to each 24-bit sample to get a 32-bit sample
                sample_width = 4  # make sure we treat the buffer as 32-bit audio now, after converting it from 24-bit audio
            if self.audio_reader.getnchannels() != 1:  # stereo audio
                buffer = audioop.tomono(buffer, sample_width, 1, 1)  # convert stereo audio data to mono
            return buffer
class AudioData(object):
    """
    Creates a new ``AudioData`` instance, which represents mono audio data.

    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.

    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.

    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).

    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
    """
    def __init__(self, frame_data, sample_rate, sample_width):
        assert sample_rate > 0, "Sample rate must be a positive integer"
        assert sample_width % 1 == 0 and 1 <= sample_width <= 4, "Sample width must be between 1 and 4 inclusive"
        self.frame_data = frame_data
        self.sample_rate = sample_rate
        self.sample_width = int(sample_width)

    def get_segment(self, start_ms=None, end_ms=None):
        """
        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.

        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
        """
        assert start_ms is None or start_ms >= 0, "``start_ms`` must be a non-negative number"
        assert end_ms is None or end_ms >= (0 if start_ms is None else start_ms), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
        if start_ms is None:
            start_byte = 0
        else:
            # NOTE(review): byte offsets are not explicitly rounded to a multiple of
            # ``sample_width``; for typical sample rates (multiples of 1000) the result is
            # always sample-aligned, but confirm before using exotic rates
            start_byte = int((start_ms * self.sample_rate * self.sample_width) // 1000)
        if end_ms is None:
            end_byte = len(self.frame_data)
        else:
            end_byte = int((end_ms * self.sample_rate * self.sample_width) // 1000)
        return AudioData(self.frame_data[start_byte:end_byte], self.sample_rate, self.sample_width)

    def get_raw_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
        """
        assert convert_rate is None or convert_rate > 0, "Sample rate to convert to must be a positive integer"
        assert convert_width is None or (convert_width % 1 == 0 and 1 <= convert_width <= 4), "Sample width to convert to must be between 1 and 4 inclusive"

        raw_data = self.frame_data

        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
        if self.sample_width == 1:
            raw_data = audioop.bias(raw_data, 1, -128)  # subtract 128 from every sample to make them act like signed samples

        # resample audio at the desired rate if specified
        if convert_rate is not None and self.sample_rate != convert_rate:
            raw_data, _ = audioop.ratecv(raw_data, self.sample_width, 1, self.sample_rate, convert_rate, None)

        # convert samples to desired sample width if specified
        if convert_width is not None and self.sample_width != convert_width:
            if convert_width == 3:  # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
                try:
                    audioop.bias(b"", 3, 0)  # test whether 24-bit audio is supported (``audioop`` in Python 3.3 and below doesn't support sample width 3, while Python 3.4+ does)
                except audioop.error:  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
                    # BUGFIX: the intermediate 32-bit conversion now only happens on this fallback
                    # path; previously the 32-bit intermediate was converted once more using the
                    # original ``self.sample_width``, which corrupted samples on the supported path
                    raw_data = audioop.lin2lin(raw_data, self.sample_width, 4)  # convert audio into 32-bit first, which is always supported
                    raw_data = b"".join(raw_data[i + 1:i + 4] for i in range(0, len(raw_data), 4))  # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
                else:  # 24-bit audio fully supported, we don't need to shim anything
                    raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width)
            else:
                raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width)

        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
        # (BUGFIX: this must also happen when the input is 8-bit and no width conversion was requested,
        # otherwise unconverted 8-bit audio would be returned with the -128 bias still applied)
        if convert_width == 1 or (convert_width is None and self.sample_width == 1):
            raw_data = audioop.bias(raw_data, 1, 128)  # add 128 to every sample to make them act like unsigned samples again

        return raw_data

    def get_wav_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = self.sample_rate if convert_rate is None else convert_rate
        sample_width = self.sample_width if convert_width is None else convert_width

        # generate the WAV file contents
        with io.BytesIO() as wav_file:
            wav_writer = wave.open(wav_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                wav_writer.setframerate(sample_rate)
                wav_writer.setsampwidth(sample_width)
                wav_writer.setnchannels(1)
                wav_writer.writeframes(raw_data)
                wav_data = wav_file.getvalue()
            finally:  # make sure resources are cleaned up
                wav_writer.close()
        return wav_data

    def get_aiff_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = self.sample_rate if convert_rate is None else convert_rate
        sample_width = self.sample_width if convert_width is None else convert_width

        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
        if hasattr(audioop, "byteswap"):  # ``audioop.byteswap`` was only added in Python 3.4
            raw_data = audioop.byteswap(raw_data, sample_width)
        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
            raw_data = raw_data[sample_width - 1::-1] + b"".join(raw_data[i + sample_width:i:-1] for i in range(sample_width - 1, len(raw_data), sample_width))

        # generate the AIFF-C file contents
        with io.BytesIO() as aiff_file:
            aiff_writer = aifc.open(aiff_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                aiff_writer.setframerate(sample_rate)
                aiff_writer.setsampwidth(sample_width)
                aiff_writer.setnchannels(1)
                aiff_writer.writeframes(raw_data)
                aiff_data = aiff_file.getvalue()
            finally:  # make sure resources are cleaned up
                aiff_writer.close()
        return aiff_data

    def get_flac_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.

        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
        """
        assert convert_width is None or (convert_width % 1 == 0 and 1 <= convert_width <= 3), "Sample width to convert to must be between 1 and 3 inclusive"

        if self.sample_width > 3 and convert_width is None:  # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder
            convert_width = 3  # the largest supported sample width is 24-bit, so we'll limit the sample width to that

        # run the FLAC converter with the WAV data to get the FLAC data
        wav_data = self.get_wav_data(convert_rate, convert_width)
        flac_converter = get_flac_converter()
        if os.name == "nt":  # on Windows, specify that the process is to be started without showing a console window
            startup_info = subprocess.STARTUPINFO()
            startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW  # specify that the wShowWindow field of `startup_info` contains a value
            startup_info.wShowWindow = subprocess.SW_HIDE  # specify that the console window should be hidden
        else:
            startup_info = None  # default startupinfo
        process = subprocess.Popen([
            flac_converter,
            "--stdout", "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
            "--best",  # highest level of compression available
            "-",  # the input WAV file contents will be given in stdin
        ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info)
        flac_data, _ = process.communicate(wav_data)  # stderr is not piped, so the second element is always None
        return flac_data
466 """ 467 self.energy_threshold = 300 # minimum audio energy to consider for recording 468 self.dynamic_energy_threshold = True 469 self.dynamic_energy_adjustment_damping = 0.15 470 self.dynamic_energy_ratio = 1.5 471 self.pause_threshold = 0.8 # seconds of non-speaking audio before a phrase is considered complete 472 self.operation_timeout = None # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout 473 474 self.phrase_threshold = 0.3 # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops) 475 self.non_speaking_duration = 0.5 # seconds of non-speaking audio to keep on both sides of the recording 476 477 def record(self, source, duration=None, offset=None): 478 """ 479 Records up to ``duration`` seconds of audio from ``source`` (an ``AudioSource`` instance) starting at ``offset`` (or at the beginning if not specified) into an ``AudioData`` instance, which it returns. 480 481 If ``duration`` is not specified, then it will record until there is no more audio input. 482 """ 483 assert isinstance(source, AudioSource), "Source must be an audio source" 484 assert source.stream is not None, "Audio source must be entered before recording, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?" 
    def record(self, source, duration=None, offset=None):
        """
        Records up to ``duration`` seconds of audio from ``source`` (an ``AudioSource`` instance) starting at ``offset`` (or at the beginning if not specified) into an ``AudioData`` instance, which it returns.

        If ``duration`` is not specified, then it will record until there is no more audio input.
        """
        assert isinstance(source, AudioSource), "Source must be an audio source"
        assert source.stream is not None, "Audio source must be entered before recording, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"

        frames = io.BytesIO()
        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE  # ``+ 0.0`` forces float division (Python 2 compatibility)
        elapsed_time = 0
        offset_time = 0
        offset_reached = False
        while True:  # loop for the total number of chunks needed
            # note: ``offset`` is tested for truthiness, so an offset of 0 behaves like no offset
            if offset and not offset_reached:
                offset_time += seconds_per_buffer
                if offset_time > offset:
                    offset_reached = True

            # the stream must still be read (and discarded) while skipping to the offset
            buffer = source.stream.read(source.CHUNK)
            if len(buffer) == 0: break  # end of the audio input

            if offset_reached or not offset:
                elapsed_time += seconds_per_buffer
                # ``duration`` is also tested for truthiness, so a duration of 0 records everything
                if duration and elapsed_time > duration: break

                frames.write(buffer)

        frame_data = frames.getvalue()
        frames.close()
        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
520 assert self.pause_threshold >= self.non_speaking_duration >= 0 521 522 seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE 523 elapsed_time = 0 524 525 # adjust energy threshold until a phrase starts 526 while True: 527 elapsed_time += seconds_per_buffer 528 if elapsed_time > duration: break 529 buffer = source.stream.read(source.CHUNK) 530 energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal 531 532 # dynamically adjust the energy threshold using asymmetric weighted average 533 damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates 534 target_energy = energy * self.dynamic_energy_ratio 535 self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping) 536 537 def snowboy_wait_for_hot_word(self, snowboy_location, snowboy_hot_word_files, source, timeout=None): 538 # load snowboy library (NOT THREAD SAFE) 539 sys.path.append(snowboy_location) 540 import snowboydetect 541 sys.path.pop() 542 543 detector = snowboydetect.SnowboyDetect( 544 resource_filename=os.path.join(snowboy_location, "resources", "common.res").encode(), 545 model_str=",".join(snowboy_hot_word_files).encode() 546 ) 547 detector.SetAudioGain(1.0) 548 detector.SetSensitivity(",".join(["0.4"] * len(snowboy_hot_word_files)).encode()) 549 snowboy_sample_rate = detector.SampleRate() 550 551 elapsed_time = 0 552 seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE 553 resampling_state = None 554 555 # buffers capable of holding 5 seconds of original and resampled audio 556 five_seconds_buffer_count = int(math.ceil(5 / seconds_per_buffer)) 557 frames = collections.deque(maxlen=five_seconds_buffer_count) 558 resampled_frames = collections.deque(maxlen=five_seconds_buffer_count) 559 while True: 560 elapsed_time += seconds_per_buffer 561 if timeout and elapsed_time > timeout: 562 raise WaitTimeoutError("listening timed out while waiting for hotword to be said") 563 564 buffer 
    def snowboy_wait_for_hot_word(self, snowboy_location, snowboy_hot_word_files, source, timeout=None):
        # Blocks until one of the Snowboy hotwords is detected on ``source``, then returns a tuple of
        # (up to the last 5 seconds of raw audio, seconds of audio consumed).
        # Raises ``WaitTimeoutError`` if ``timeout`` seconds elapse first.

        # load snowboy library (NOT THREAD SAFE) — temporarily extends sys.path so the bundled
        # ``snowboydetect`` module can be imported from the Snowboy root directory
        sys.path.append(snowboy_location)
        import snowboydetect
        sys.path.pop()

        detector = snowboydetect.SnowboyDetect(
            resource_filename=os.path.join(snowboy_location, "resources", "common.res").encode(),
            model_str=",".join(snowboy_hot_word_files).encode()
        )
        detector.SetAudioGain(1.0)
        # one sensitivity value per hotword model, as a comma-separated byte string
        detector.SetSensitivity(",".join(["0.4"] * len(snowboy_hot_word_files)).encode())
        snowboy_sample_rate = detector.SampleRate()

        elapsed_time = 0
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        resampling_state = None  # carried across ``audioop.ratecv`` calls to resample a continuous stream

        # buffers capable of holding 5 seconds of original and resampled audio
        five_seconds_buffer_count = int(math.ceil(5 / seconds_per_buffer))
        frames = collections.deque(maxlen=five_seconds_buffer_count)
        resampled_frames = collections.deque(maxlen=five_seconds_buffer_count)
        while True:
            elapsed_time += seconds_per_buffer
            if timeout and elapsed_time > timeout:
                raise WaitTimeoutError("listening timed out while waiting for hotword to be said")

            buffer = source.stream.read(source.CHUNK)
            if len(buffer) == 0: break  # reached end of the stream
            frames.append(buffer)

            # resample audio to the required sample rate
            resampled_buffer, resampling_state = audioop.ratecv(buffer, source.SAMPLE_WIDTH, 1, source.SAMPLE_RATE, snowboy_sample_rate, resampling_state)
            resampled_frames.append(resampled_buffer)

            # run Snowboy on the resampled audio
            snowboy_result = detector.RunDetection(b"".join(resampled_frames))
            assert snowboy_result != -1, "Error initializing streams or reading audio data"
            if snowboy_result > 0: break  # wake word found

        return b"".join(frames), elapsed_time
        When used, this function will pause until Snowboy detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Snowboy support, or a tuple of the form ``(SNOWBOY_LOCATION, LIST_OF_HOT_WORD_FILES)``, where ``SNOWBOY_LOCATION`` is the path to the Snowboy root directory, and ``LIST_OF_HOT_WORD_FILES`` is a list of paths to Snowboy hotword configuration files (`*.pmdl` or `*.umdl` format).

        This operation will always complete within ``timeout + phrase_timeout`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception.
        """
        assert isinstance(source, AudioSource), "Source must be an audio source"
        assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
        assert self.pause_threshold >= self.non_speaking_duration >= 0
        if snowboy_configuration is not None:
            assert os.path.isfile(os.path.join(snowboy_configuration[0], "snowboydetect.py")), "``snowboy_configuration[0]`` must be a Snowboy root directory containing ``snowboydetect.py``"
            for hot_word_file in snowboy_configuration[1]:
                assert os.path.isfile(hot_word_file), "``snowboy_configuration[1]`` must be a list of Snowboy hot word configuration files"

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer))  # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
        phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer))  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
        non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))  # maximum number of buffers of non-speaking audio to retain before and after a phrase

        # read audio input for phrases until there is a phrase that is long enough
        elapsed_time = 0  # number of seconds of audio read
        buffer = b""  # an empty buffer means that the stream has ended and there is no data left to read
        while True:
            frames = collections.deque()

            if snowboy_configuration is None:
                # store audio input until the phrase starts
                while True:
                    # handle waiting too long for phrase by raising an exception
                    elapsed_time += seconds_per_buffer
                    if timeout and elapsed_time > timeout:
                        raise WaitTimeoutError("listening timed out while waiting for phrase to start")

                    buffer = source.stream.read(source.CHUNK)
                    if len(buffer) == 0: break  # reached end of the stream
                    frames.append(buffer)
                    if len(frames) > non_speaking_buffer_count:  # ensure we only keep the needed amount of non-speaking buffers
                        frames.popleft()

                    # detect whether speaking has started on audio input
                    energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
                    if energy > self.energy_threshold: break

                    # dynamically adjust the energy threshold using asymmetric weighted average
                    if self.dynamic_energy_threshold:
                        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer  # account for different chunk sizes and rates
                        target_energy = energy * self.dynamic_energy_ratio
                        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
            else:
                # read audio input until the hotword is said
                snowboy_location, snowboy_hot_word_files = snowboy_configuration
                buffer, delta_time = self.snowboy_wait_for_hot_word(snowboy_location, snowboy_hot_word_files, source, timeout)
                elapsed_time += delta_time
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)

            # read audio input until the phrase ends
            pause_count, phrase_count = 0, 0
            phrase_start_time = elapsed_time
            while True:
                # handle phrase being too long by cutting off the audio
                elapsed_time += seconds_per_buffer
                if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit:
                    break

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                phrase_count += 1

                # check if speaking has stopped for longer than the pause threshold on the audio input
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # unit energy of the audio signal within the buffer
                if energy > self.energy_threshold:
                    pause_count = 0
                else:
                    pause_count += 1
                    if pause_count > pause_buffer_count:  # end of the phrase
                        break

            # check how long the detected phrase is, and retry listening if the phrase is too short
            phrase_count -= pause_count  # exclude the buffers for the pause before the phrase
            if phrase_count >= phrase_buffer_count or len(buffer) == 0: break  # phrase is long enough or we've reached the end of the stream, so stop listening

        # obtain frame data
        for i in range(pause_count - non_speaking_buffer_count): frames.pop()  # remove extra non-speaking frames at the end
        frame_data = b"".join(frames)

        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def listen_in_background(self, source, callback, phrase_time_limit=None):
        """
        Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance and call ``callback`` with that ``AudioData`` instance as soon as each phrase are detected.

        Returns a function object that, when called, requests that the background listener thread stop. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads.
        The function accepts one parameter, ``wait_for_stop``: if truthy, the function will wait for the background listener to stop before returning, otherwise it will return immediately and the background listener thread might still be running for a second or two afterwards. Additionally, if you are using a truthy value for ``wait_for_stop``, you must call the function from the same thread you originally called ``listen_in_background`` from.

        Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. The ``phrase_time_limit`` parameter works in the same way as the ``phrase_time_limit`` parameter for ``recognizer_instance.listen(source)``, as well.

        The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that ``callback`` function will be called from a non-main thread.
        """
        assert isinstance(source, AudioSource), "Source must be an audio source"
        running = [True]  # single-element list so the flag can be mutated from both the listener thread and ``stopper``

        def threaded_listen():
            # repeatedly listen for phrases and hand each one to ``callback`` until asked to stop
            with source as s:
                while running[0]:
                    try:  # listen for 1 second, then check again if the stop function has been called
                        audio = self.listen(s, 1, phrase_time_limit)
                    except WaitTimeoutError:  # listening timed out, just try again
                        pass
                    else:
                        if running[0]: callback(self, audio)

        def stopper(wait_for_stop=True):
            # request that the background thread stop, optionally blocking until it has
            running[0] = False
            if wait_for_stop:
                listener_thread.join()  # block until the background thread is done, which can take around 1 second

        listener_thread = threading.Thread(target=threaded_listen)
        listener_thread.daemon = True
        listener_thread.start()
        return stopper

    def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.

        If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.

        Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, content of ``grammar`` will be ignored.

        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.

        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
        """
        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
        assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
        assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

        # import the PocketSphinx speech recognition module
        try:
            from pocketsphinx import pocketsphinx, Jsgf, FsgModel

        except ImportError:
            raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
        except ValueError:
            raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
        if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"):
            raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")

        if isinstance(language, str):  # directory containing language data
            language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
            if not os.path.isdir(language_directory):
                raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
            acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
            language_model_file = os.path.join(language_directory, "language-model.lm.bin")
            phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")  # NOTE: "pronounciation" misspelling matches the data file name shipped with the library
        else:  # 3-tuple of Sphinx data file paths
            acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
            if not os.path.isdir(acoustic_parameters_directory):
                raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
            if not os.path.isfile(language_model_file):
                raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
            if not os.path.isfile(phoneme_dictionary_file):
                raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

        # create decoder object
        config = pocketsphinx.Decoder.default_config()
        config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
        config.set_string("-lm", language_model_file)
        config.set_string("-dict", phoneme_dictionary_file)
        config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
        decoder = pocketsphinx.Decoder(config)

        # obtain audio data
        raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format

        # obtain recognition results
        if keyword_entries is not None:  # explicitly specified set of keywords
            with PortableNamedTemporaryFile("w") as f:
                # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
                f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
                f.flush()

                # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
                decoder.set_kws("keywords", f.name)
                decoder.set_search("keywords")
                decoder.start_utt()  # begin utterance processing
                decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
                decoder.end_utt()  # stop utterance processing
        elif grammar is not None:  # a path to a FSG or JSGF grammar
            if not os.path.exists(grammar):
                raise ValueError("Grammar '{0}' does not exist.".format(grammar))
            grammar_path = os.path.abspath(os.path.dirname(grammar))
            grammar_name = os.path.splitext(os.path.basename(grammar))[0]
            fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
            if not os.path.exists(fsg_path):  # create FSG grammar if not available
                jsgf = Jsgf(grammar)
                rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
                fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
                fsg.writefile(fsg_path)
            else:
                fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
            decoder.set_fsg(grammar_name, fsg)
            decoder.set_search(grammar_name)
            decoder.start_utt()
            decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
            decoder.end_utt()  # stop utterance processing
        else:  # no keywords, perform freeform recognition
            decoder.start_utt()  # begin utterance processing
            decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
            decoder.end_utt()  # stop utterance processing

        if show_all: return decoder

        # return results
        hypothesis = decoder.hyp()
        if hypothesis is not None: return hypothesis.hypstr
        raise UnknownValueError()  # no transcriptions available

    def recognize_google(self, audio_data, key=None, language="en-US", show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

        The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box.
This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**. 813 814 To obtain your own API key, simply following the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API". 815 816 The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__. 817 818 Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. 819 820 Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
821 """ 822 assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" 823 assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string" 824 assert isinstance(language, str), "``language`` must be a string" 825 826 flac_data = audio_data.get_flac_data( 827 convert_rate=None if audio_data.sample_rate >= 8000 else 8000, # audio samples must be at least 8 kHz 828 convert_width=2 # audio samples must be 16-bit 829 ) 830 if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" 831 url = "http://www.google.com/speech-api/v2/recognize?{}".format(urlencode({ 832 "client": "chromium", 833 "lang": language, 834 "key": key, 835 })) 836 request = Request(url, data=flac_data, headers={"Content-Type": "audio/x-flac; rate={}".format(audio_data.sample_rate)}) 837 838 # obtain audio transcription results 839 try: 840 response = urlopen(request, timeout=self.operation_timeout) 841 except HTTPError as e: 842 raise RequestError("recognition request failed: {}".format(e.reason)) 843 except URLError as e: 844 raise RequestError("recognition connection failed: {}".format(e.reason)) 845 response_text = response.read().decode("utf-8") 846 847 # ignore any blank blocks 848 actual_result = [] 849 for line in response_text.split("\n"): 850 if not line: continue 851 result = json.loads(line)["result"] 852 if len(result) != 0: 853 actual_result = result[0] 854 break 855 856 # return results 857 if show_all: return actual_result 858 if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0: raise UnknownValueError() 859 860 if "confidence" in actual_result["alternative"]: 861 # return alternative with highest confidence score 862 best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"]) 863 else: 864 # when there is no confidence available, we arbitrarily choose the first hypothesis. 
865 best_hypothesis = actual_result["alternative"][0] 866 if "transcript" not in best_hypothesis: raise UnknownValueError() 867 return best_hypothesis["transcript"] 868 869 def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False): 870 """ 871 Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. 872 873 This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart <https://cloud.google.com/speech/docs/getting-started>`__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__. 874 875 The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation <https://cloud.google.com/speech/docs/languages>`__. 876 877 If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__. 878 879 Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. 
        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
        """
        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
        if credentials_json is not None:
            try: json.loads(credentials_json)
            except Exception: raise AssertionError("``credentials_json`` must be ``None`` or a valid JSON string")
        assert isinstance(language, str), "``language`` must be a string"
        assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings"

        # See https://cloud.google.com/speech/reference/rest/v1/RecognitionConfig
        flac_data = audio_data.get_flac_data(
            convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)),  # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
            convert_width=2  # audio samples must be 16-bit
        )

        try:
            from oauth2client.client import GoogleCredentials
            from googleapiclient.discovery import build
            import googleapiclient.errors

            # cannot simply use 'http = httplib2.Http(timeout=self.operation_timeout)'
            # because discovery.build() says 'Arguments http and credentials are mutually exclusive'
            import socket
            import googleapiclient.http
            if self.operation_timeout and socket.getdefaulttimeout() is None:
                # override constant (used by googleapiclient.http.build_http())
                googleapiclient.http.DEFAULT_HTTP_TIMEOUT_SEC = self.operation_timeout

            if credentials_json is None:
                api_credentials = GoogleCredentials.get_application_default()
            else:
                # the credentials can only be read from a file, so we'll make a temp file and write in the contents to work around that
                with PortableNamedTemporaryFile("w") as f:
                    f.write(credentials_json)
                    f.flush()
                    api_credentials = GoogleCredentials.from_stream(f.name)

            speech_service = build("speech", "v1", credentials=api_credentials, cache_discovery=False)
        except ImportError:
            raise RequestError("missing google-api-python-client module: ensure that google-api-python-client is set up correctly.")

        speech_config = {"encoding": "FLAC", "sampleRateHertz": audio_data.sample_rate, "languageCode": language}
        if preferred_phrases is not None:
            speech_config["speechContext"] = {"phrases": preferred_phrases}
        if show_all:
            speech_config["enableWordTimeOffsets"] = True  # some useful extra options for when we want all the output
        request = speech_service.speech().recognize(body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config})

        try:
            response = request.execute()
        except googleapiclient.errors.HttpError as e:
            raise RequestError(e)
        except URLError as e:
            raise RequestError("recognition connection failed: {0}".format(e.reason))

        if show_all: return response
        if "results" not in response or len(response["results"]) == 0: raise UnknownValueError()
        # concatenate the top alternative of every result into one transcript
        transcript = ""
        for result in response["results"]:
            transcript += result["alternatives"][0]["transcript"].strip() + " "

        return transcript

    def recognize_wit(self, audio_data, key, show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API.

        The Wit.ai API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://wit.ai/>`__ and creating an app. You will need to add at least one intent to the app before you can see the API key, though the actual intent settings don't matter.
        To get the API key for a Wit.ai app, go to the app's overview page, go to the section titled "Make an API request", and look for something along the lines of ``Authorization: Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX``; ``XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX`` is the API key. Wit.ai API keys are 32-character uppercase alphanumeric strings.

        The recognition language is configured in the Wit.ai app settings.

        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://wit.ai/docs/http/20141022#get-intent-via-text-link>`__ as a JSON dictionary.

        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
        """
        assert isinstance(audio_data, AudioData), "Data must be audio data"
        assert isinstance(key, str), "``key`` must be a string"

        wav_data = audio_data.get_wav_data(
            convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples must be at least 8 kHz
            convert_width=2  # audio samples should be 16-bit
        )
        url = "https://api.wit.ai/speech?v=20160526"
        request = Request(url, data=wav_data, headers={"Authorization": "Bearer {}".format(key), "Content-Type": "audio/wav"})
        try:
            response = urlopen(request, timeout=self.operation_timeout)
        except HTTPError as e:
            raise RequestError("recognition request failed: {}".format(e.reason))
        except URLError as e:
            raise RequestError("recognition connection failed: {}".format(e.reason))
        response_text = response.read().decode("utf-8")
        result = json.loads(response_text)

        # return results
        if show_all: return result
        if "_text" not in result or result["_text"] is None: raise UnknownValueError()
        return result["_text"]

    def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.

        The Microsoft Bing Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.

        To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API > "Create", and fill in the form to make a "Bing Speech API" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings.

        The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".

        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.

        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
        """
        assert isinstance(audio_data, AudioData), "Data must be audio data"
        assert isinstance(key, str), "``key`` must be a string"
        assert isinstance(language, str), "``language`` must be a string"

        # reuse a previously cached access token if one exists and has not expired
        access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None)
        allow_caching = True
        try:
            from time import monotonic  # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
        except ImportError:
            try:
                from monotonic import monotonic  # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic)
            except (ImportError, RuntimeError):
                expire_time = None  # monotonic time not available, don't cache access tokens
                allow_caching = False  # don't allow caching, since monotonic time isn't available
        if expire_time is None or monotonic() > expire_time:  # caching not enabled, first credential request, or the access token from the previous one expired
            # get an access token using OAuth
            credential_url = "https://api.cognitive.microsoft.com/sts/v1.0/issueToken"
            credential_request = Request(credential_url, data=b"", headers={
                "Content-type": "application/x-www-form-urlencoded",
                "Content-Length": "0",
                "Ocp-Apim-Subscription-Key": key,
            })

            if allow_caching:
                start_time = monotonic()

            try:
                credential_response = urlopen(credential_request, timeout=60)  # credential response can take longer, use longer timeout instead of default one
            except HTTPError as e:
                raise RequestError("credential request failed: {}".format(e.reason))
            except URLError as e:
                raise RequestError("credential connection failed: {}".format(e.reason))
            access_token = credential_response.read().decode("utf-8")

            if allow_caching:
                # save the token for the duration it is valid for
                self.bing_cached_access_token = access_token
                self.bing_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition, the token expires in exactly 10 minutes

        wav_data = audio_data.get_wav_data(
            convert_rate=16000,  # audio samples must be 8 kHz or 16 kHz
            convert_width=2  # audio samples should be 16-bit
        )

        url = "https://speech.platform.bing.com/speech/recognition/interactive/cognitiveservices/v1?{}".format(urlencode({
            "language": language,
            "locale": language,
            "requestid": uuid.uuid4(),
        }))

        if sys.version_info >= (3, 6):  # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible
            request = Request(url, data=io.BytesIO(wav_data), headers={
                "Authorization": "Bearer {}".format(access_token),
                "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
                "Transfer-Encoding": "chunked",
            })
        else:  # fall back on manually formatting the POST body as a chunked request
            ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8")
            chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n"
            request = Request(url, data=chunked_transfer_encoding_data, headers={
                "Authorization": "Bearer {}".format(access_token),
                "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
                "Transfer-Encoding": "chunked",
            })

        try:
            response = urlopen(request, timeout=self.operation_timeout)
        except HTTPError as e:
            raise RequestError("recognition request failed: {}".format(e.reason))
        except URLError as e:
            raise RequestError("recognition connection failed: {}".format(e.reason))
        response_text = response.read().decode("utf-8")
        result = json.loads(response_text)

        # return results
        if show_all: return result
        if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError()
        return result["DisplayText"]

    def recognize_houndify(self, audio_data, client_id, client_key, show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Houndify API.

        The Houndify client ID and client key are specified by ``client_id`` and ``client_key``, respectively. Unfortunately, these are not available without `signing up for an account <https://www.houndify.com/signup>`__. Once logged into the `dashboard <https://www.houndify.com/dashboard>`__, you will want to select "Register a new client", and fill in the form as necessary. When at the "Enable Domains" page, enable the "Speech To Text Only" domain, and then select "Save & Continue".

        To get the client ID and client key for a Houndify client, go to the `dashboard <https://www.houndify.com/dashboard>`__ and select the client's "View Details" link. On the resulting page, the client ID and client key will be visible. Client IDs and client keys are both Base64-encoded strings.

        Currently, only English is supported as a recognition language.

        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.

        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
1087 """ 1088 assert isinstance(audio_data, AudioData), "Data must be audio data" 1089 assert isinstance(client_id, str), "``client_id`` must be a string" 1090 assert isinstance(client_key, str), "``client_key`` must be a string" 1091 1092 wav_data = audio_data.get_wav_data( 1093 convert_rate=None if audio_data.sample_rate in [8000, 16000] else 16000, # audio samples must be 8 kHz or 16 kHz 1094 convert_width=2 # audio samples should be 16-bit 1095 ) 1096 url = "https://api.houndify.com/v1/audio" 1097 user_id, request_id = str(uuid.uuid4()), str(uuid.uuid4()) 1098 request_time = str(int(time.time())) 1099 request_signature = base64.urlsafe_b64encode( 1100 hmac.new( 1101 base64.urlsafe_b64decode(client_key), 1102 user_id.encode("utf-8") + b";" + request_id.encode("utf-8") + request_time.encode("utf-8"), 1103 hashlib.sha256 1104 ).digest() # get the HMAC digest as bytes 1105 ).decode("utf-8") 1106 request = Request(url, data=wav_data, headers={ 1107 "Content-Type": "application/json", 1108 "Hound-Request-Info": json.dumps({"ClientID": client_id, "UserID": user_id}), 1109 "Hound-Request-Authentication": "{};{}".format(user_id, request_id), 1110 "Hound-Client-Authentication": "{};{};{}".format(client_id, request_time, request_signature) 1111 }) 1112 try: 1113 response = urlopen(request, timeout=self.operation_timeout) 1114 except HTTPError as e: 1115 raise RequestError("recognition request failed: {}".format(e.reason)) 1116 except URLError as e: 1117 raise RequestError("recognition connection failed: {}".format(e.reason)) 1118 response_text = response.read().decode("utf-8") 1119 result = json.loads(response_text) 1120 1121 # return results 1122 if show_all: return result 1123 if "Disambiguation" not in result or result["Disambiguation"] is None: 1124 raise UnknownValueError() 1125 return result['Disambiguation']['ChoiceData'][0]['Transcription'] 1126 1127 def recognize_ibm(self, audio_data, username, password, language="en-US", show_all=False): 1128 """ 1129 Performs 
speech recognition on ``audio_data`` (an ``AudioData`` instance), using the IBM Speech to Text API. 1130 1131 The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without `signing up for an account <https://console.ng.bluemix.net/registration/>`__. Once logged into the Bluemix console, follow the instructions for `creating an IBM Watson service instance <https://www.ibm.com/watson/developercloud/doc/getting_started/gs-credentials.shtml>`__, where the Watson service is "Speech To Text". IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, while passwords are mixed-case alphanumeric strings. 1132 1133 The recognition language is determined by ``language``, an RFC5646 language tag with a dialect like ``"en-US"`` (US English) or ``"zh-CN"`` (Mandarin Chinese), defaulting to US English. The supported language values are listed under the ``model`` parameter of the `audio recognition API documentation <https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/#sessionless_methods>`__, in the form ``LANGUAGE_BroadbandModel``, where ``LANGUAGE`` is the language value. 1134 1135 Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/#sessionless_methods>`__ as a JSON dictionary. 1136 1137 Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
1138 """ 1139 assert isinstance(audio_data, AudioData), "Data must be audio data" 1140 assert isinstance(username, str), "``username`` must be a string" 1141 assert isinstance(password, str), "``password`` must be a string" 1142 1143 flac_data = audio_data.get_flac_data( 1144 convert_rate=None if audio_data.sample_rate >= 16000 else 16000, # audio samples should be at least 16 kHz 1145 convert_width=None if audio_data.sample_width >= 2 else 2 # audio samples should be at least 16-bit 1146 ) 1147 url = "https://stream.watsonplatform.net/speech-to-text/api/v1/recognize?{}".format(urlencode({ 1148 "profanity_filter": "false", 1149 "model": "{}_BroadbandModel".format(language), 1150 "inactivity_timeout": -1, # don't stop recognizing when the audio stream activity stops 1151 })) 1152 request = Request(url, data=flac_data, headers={ 1153 "Content-Type": "audio/x-flac", 1154 "X-Watson-Learning-Opt-Out": "true", # prevent requests from being logged, for improved privacy 1155 }) 1156 authorization_value = base64.standard_b64encode("{}:{}".format(username, password).encode("utf-8")).decode("utf-8") 1157 request.add_header("Authorization", "Basic {}".format(authorization_value)) 1158 try: 1159 response = urlopen(request, timeout=self.operation_timeout) 1160 except HTTPError as e: 1161 raise RequestError("recognition request failed: {}".format(e.reason)) 1162 except URLError as e: 1163 raise RequestError("recognition connection failed: {}".format(e.reason)) 1164 response_text = response.read().decode("utf-8") 1165 result = json.loads(response_text) 1166 1167 # return results 1168 if show_all: return result 1169 if "results" not in result or len(result["results"]) < 1 or "alternatives" not in result["results"][0]: 1170 raise UnknownValueError() 1171 1172 transcription = [] 1173 for utterance in result["results"]: 1174 if "alternatives" not in utterance: raise UnknownValueError() 1175 for hypothesis in utterance["alternatives"]: 1176 if "transcript" in hypothesis: 1177 
transcription.append(hypothesis["transcript"]) 1178 return "\n".join(transcription) 1179 1180 1181def get_flac_converter(): 1182 """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" 1183 flac_converter = shutil_which("flac") # check for installed version first 1184 if flac_converter is None: # flac utility is not installed 1185 base_path = os.path.dirname(os.path.abspath(__file__)) # directory of the current module file, where all the FLAC bundled binaries are stored 1186 system, machine = platform.system(), platform.machine() 1187 if system == "Windows" and machine in {"i686", "i786", "x86", "x86_64", "AMD64"}: 1188 flac_converter = os.path.join(base_path, "flac-win32.exe") 1189 elif system == "Darwin" and machine in {"i686", "i786", "x86", "x86_64", "AMD64"}: 1190 flac_converter = os.path.join(base_path, "flac-mac") 1191 elif system == "Linux" and machine in {"i686", "i786", "x86"}: 1192 flac_converter = os.path.join(base_path, "flac-linux-x86") 1193 elif system == "Linux" and machine in {"x86_64", "AMD64"}: 1194 flac_converter = os.path.join(base_path, "flac-linux-x86_64") 1195 else: # no FLAC converter available 1196 raise OSError("FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent") 1197 1198 # mark FLAC converter as executable if possible 1199 try: 1200 # handle known issue when running on docker: 1201 # run executable right after chmod() may result in OSError "Text file busy" 1202 # fix: flush FS with sync 1203 if not os.access(flac_converter, os.X_OK): 1204 stat_info = os.stat(flac_converter) 1205 os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC) 1206 if 'Linux' in platform.system(): 1207 os.sync() if sys.version_info >= (3, 3) else os.system('sync') 1208 1209 except OSError: pass 1210 1211 return flac_converter 1212 1213 1214def shutil_which(pgm): 1215 """Python 2 compatibility: 
backport of ``shutil.which()`` from Python 3""" 1216 path = os.getenv('PATH') 1217 for p in path.split(os.path.pathsep): 1218 p = os.path.join(p, pgm) 1219 if os.path.exists(p) and os.access(p, os.X_OK): 1220 return p 1221 1222 1223class PortableNamedTemporaryFile(object): 1224 """Limited replacement for ``tempfile.NamedTemporaryFile``, except unlike ``tempfile.NamedTemporaryFile``, the file can be opened again while it's currently open, even on Windows.""" 1225 def __init__(self, mode="w+b"): 1226 self.mode = mode 1227 1228 def __enter__(self): 1229 # create the temporary file and open it 1230 import tempfile 1231 file_descriptor, file_path = tempfile.mkstemp() 1232 self._file = os.fdopen(file_descriptor, self.mode) 1233 1234 # the name property is a public field 1235 self.name = file_path 1236 return self 1237 1238 def __exit__(self, exc_type, exc_value, traceback): 1239 self._file.close() 1240 os.remove(self.name) 1241 1242 def write(self, *args, **kwargs): 1243 return self._file.write(*args, **kwargs) 1244 1245 def writelines(self, *args, **kwargs): 1246 return self._file.writelines(*args, **kwargs) 1247 1248 def flush(self, *args, **kwargs): 1249 return self._file.flush(*args, **kwargs) 1250 1251 1252# =============================== 1253# backwards compatibility shims 1254# =============================== 1255 1256WavFile = AudioFile # WavFile was renamed to AudioFile in 3.4.1 1257 1258 1259def recognize_api(self, audio_data, client_access_token, language="en", session_id=None, show_all=False): 1260 wav_data = audio_data.get_wav_data(convert_rate=16000, convert_width=2) 1261 url = "https://api.api.ai/v1/query" 1262 while True: 1263 boundary = uuid.uuid4().hex 1264 if boundary.encode("utf-8") not in wav_data: break 1265 if session_id is None: session_id = uuid.uuid4().hex 1266 data = b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"request\"\r\n" + b"Content-Type: application/json\r\n" + b"\r\n" + b"{\"v\": \"20150910\", 
\"sessionId\": \"" + session_id.encode("utf-8") + b"\", \"lang\": \"" + language.encode("utf-8") + b"\"}\r\n" + b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"voiceData\"; filename=\"audio.wav\"\r\n" + b"Content-Type: audio/wav\r\n" + b"\r\n" + wav_data + b"\r\n" + b"--" + boundary.encode("utf-8") + b"--\r\n" 1267 request = Request(url, data=data, headers={"Authorization": "Bearer {}".format(client_access_token), "Content-Length": str(len(data)), "Expect": "100-continue", "Content-Type": "multipart/form-data; boundary={}".format(boundary)}) 1268 try: response = urlopen(request, timeout=10) 1269 except HTTPError as e: raise RequestError("recognition request failed: {}".format(e.reason)) 1270 except URLError as e: raise RequestError("recognition connection failed: {}".format(e.reason)) 1271 response_text = response.read().decode("utf-8") 1272 result = json.loads(response_text) 1273 if show_all: return result 1274 if "status" not in result or "errorType" not in result["status"] or result["status"]["errorType"] != "success": 1275 raise UnknownValueError() 1276 return result["result"]["resolvedQuery"] 1277 1278 1279Recognizer.recognize_api = classmethod(recognize_api) # API.AI Speech Recognition is deprecated/not recommended as of 3.5.0, and currently is only optionally available for paid plans 1280