1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /* 38 * cont_ad.h -- Continuous A/D listening and silence filtering module. 39 * 40 * ********************************************** 41 * CMU ARPA Speech Project 42 * 43 * Copyright (c) 1996 Carnegie Mellon University. 44 * ALL RIGHTS RESERVED. 45 * ********************************************** 46 * 47 * HISTORY 48 * 49 * 13-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 50 * Added spf and adbufsize to cont_ad_t in order to support variable 51 * frame sizes depending on audio sampling rate. 52 * 53 * 30-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 54 * Added FILE* argument to cont_ad_powhist_dump(). 55 * 56 * 16-Jan-98 Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University 57 * Changed to use dB instead of the weird power measure. 58 * Added most system parameters to cont_ad_t instead of hardwiring 59 * them in cont_ad.c. 60 * Added cont_ad_set_params() and cont_ad_get_params(). 61 * 62 * 28-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 63 * Added cont_ad_t.siglvl. 64 * 65 * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 66 * Added the option for cont_ad_read to return -1 on EOF. 67 * 68 * 21-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 69 * Added cont_ad_set_thresh(). 70 * 71 * 20-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 72 * Separated thresholds for speech and silence. 73 * 74 * 17-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 75 * Created, based loosely on Steve Reed's original implementation. 76 */ 77 78 79 #ifndef _CONT_AD_H_ 80 #define _CONT_AD_H_ 81 82 /* Win32/WinCE DLL gunk */ 83 #include <sphinxbase/sphinxbase_export.h> 84 #include <sphinxbase/prim_type.h> 85 #include <sphinxbase/ad.h> 86 87 /** 88 * \file cont_ad.h 89 * \brief Continuous A/D listening and silence filtering module. 90 * 91 * This module is intended to be interposed as a filter between any 92 * raw A/D source and the application to remove silence regions. Its 93 * main purpose is to remove regions of silence from the raw input 94 * speech. It is initialized with a raw A/D source function (during 95 * the cont_ad_init call). The application is responsible for setting 96 * up the A/D source, turning recording on and off as it desires. 97 * Filtered A/D data can be read by the application using the 98 * cont_ad_read function. 99 * 100 * In other words, the application calls cont_ad_read instead of the 101 * raw A/D source function (e.g., ad_read in libad) to obtain filtered 102 * A/D data with silence regions removed. This module itself does not 103 * enforce any other structural changes to the application. 104 * 105 * The cont_ad_read function also updates an "absolute" timestamp (see 106 * cont_ad_t.read_ts) at the end of each invocation. The timestamp 107 * indicates the total number of samples of A/D data read until this 108 * point, including data discarded as silence frames. The application 109 * is responsible for using this timestamp to make any policy 110 * decisions regarding utterance boundaries or whatever. 111 */ 112 113 114 #include <stdio.h> 115 116 117 #ifdef __cplusplus 118 extern "C" { 119 #endif 120 #if 0 121 /* Fool Emacs. */ 122 } 123 #endif 124 125 /* States of continuous listening module */ 126 #define CONT_AD_STATE_SIL 0 127 #define CONT_AD_STATE_SPEECH 1 128 129 130 /** 131 * \struct spseg_t 132 * \brief (FOR INTERNAL USE ) Data structure for maintaining speech (non-silence) segments not yet consumed by the 133 * application. 134 */ 135 typedef struct spseg_s { 136 int32 startfrm; /**< Frame-id in adbuf (see below) of start of this segment */ 137 int32 nfrm; /**< Number of frames in segment (may wrap around adbuf) */ 138 struct spseg_s *next; /**< Next speech segment (with some intervening silence) */ 139 } spseg_t; 140 141 142 /** 143 * \struct cont_ad_t 144 * \brief Continuous listening module or object 145 * Continuous listening module or object. An application can open and maintain several 146 * such objects, if necessary. 147 * FYI: Module always in one of two states: SILENCE or SPEECH. Transitions between the 148 * two detected by sliding a window spanning several frames and looking for some minimum 149 * number of frames of the other type. 150 */ 151 typedef struct { 152 /* Function to be called for obtaining A/D data (see prototype for ad_read in ad.h) */ 153 int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max); 154 ad_rec_t *ad; /**< A/D device argument for adfunc. Also, ad->sps used to 155 determine frame size (spf, see below) */ 156 int32 rawmode; /**< Pass all input data through, without filtering silence */ 157 158 int16 *adbuf; /**< Circular buffer for maintaining A/D data read until consumed */ 159 160 /* ************************************************************************** 161 * state, read_ts, and siglvl are provided for READ-ONLY use by client 162 * applications, and are updated by calls to cont_ad_read() (see below). All 163 * other variables should be left alone. 164 */ 165 int32 state; /**< State of data returned by most recent cont_ad_read call; 166 CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH. */ 167 int32 read_ts; /**< Absolute timestamp (total no. of raw samples consumed 168 upto the most recent cont_ad_read call, starting from 169 the very beginning). Note that this is a 32-bit 170 integer; applications should guard against overflow. */ 171 int32 seglen; /**< Total no. of raw samples consumed in the segment 172 returned by the most recent cont_ad_read call. Can be 173 used to detect silence segments that have stretched long 174 enough to terminate an utterance */ 175 int32 siglvl; /**< Max signal level for the data consumed by the most recent 176 cont_ad_read call (dB range: 0-99). Can be used to 177 update a V-U meter, for example. */ 178 /* ************************************************************************ */ 179 180 int32 sps; /**< Samples/sec; moved from ad->sps to break dependence on 181 ad by N. Roy.*/ 182 183 int32 eof; /**< Whether the source ad device has encountered EOF */ 184 185 int32 spf; /**< Samples/frame; audio level is analyzed within frames */ 186 int32 adbufsize; /**< Buffer size (Number of samples) */ 187 int32 prev_sample; /**< For pre-emphasis filter */ 188 int32 headfrm; /**< Frame number in adbuf with unconsumed A/D data */ 189 int32 n_frm; /**< Number of complete frames of unconsumed A/D data in adbuf */ 190 int32 n_sample; /**< Number of samples of unconsumed data in adbuf */ 191 int32 tot_frm; /**< Total number of frames of A/D data read, including consumed ones */ 192 int32 noise_level; /**< PWP: what we claim as the "current" noise level */ 193 194 int32 *pow_hist; /**< Histogram of frame power, moving window, decayed */ 195 char *frm_pow; /**< Frame power */ 196 197 int32 auto_thresh; /**< Do automatic threshold adjustment or not */ 198 int32 delta_sil; /**< Max silence power/frame ABOVE noise level */ 199 int32 delta_speech; /**< Min speech power/frame ABOVE noise level */ 200 int32 min_noise; /**< noise lower than this we ignore */ 201 int32 max_noise; /**< noise higher than this signals an error */ 202 int32 winsize; /**< how many frames to look at for speech det */ 203 int32 speech_onset; /**< start speech on >= these many frames out of winsize, of >= delta_speech */ 204 int32 sil_onset; /**< end speech on >= these many frames out of winsize, of <= delta_sil */ 205 int32 leader; /**< pad beggining of speech with this many extra frms */ 206 int32 trailer; /**< pad end of speech with this many extra frms */ 207 208 int32 thresh_speech;/**< Frame considered to be speech if power >= thresh_speech 209 (for transitioning from SILENCE to SPEECH state) */ 210 int32 thresh_sil; /**< Frame considered to be silence if power <= thresh_sil 211 (for transitioning from SPEECH to SILENCE state) */ 212 int32 thresh_update;/**< Number of frames before next update to pow_hist/thresholds */ 213 float32 adapt_rate; /**< Linear interpolation constant for rate at which noise level adapted 214 to each estimate; 215 range: 0-1; 0=> no adaptation, 1=> instant adaptation */ 216 217 int32 tail_state; /**< State at the end of its internal buffer (internal use): 218 CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH. Note: This is 219 different from cont_ad_t.state. */ 220 int32 win_startfrm; /**< Where next analysis window begins */ 221 int32 win_validfrm; /**< Number of frames currently available from win_startfrm for analysis */ 222 int32 n_other; /**< If in SILENCE state, number of frames in analysis window considered to 223 be speech; otherwise number of frames considered to be silence */ 224 spseg_t *spseg_head;/**< First of unconsumed speech segments */ 225 spseg_t *spseg_tail;/**< Last of unconsumed speech segments */ 226 227 FILE *rawfp; /**< If non-NULL, raw audio input data processed by cont_ad 228 is dumped to this file. Controlled by user application 229 via cont_ad_set_rawfp(). NULL when cont_ad object is 230 initially created. */ 231 FILE *logfp; /**< If non-NULL, write detailed logs of this object's 232 progress to the file. Controlled by user application 233 via cont_ad_set_logfp(). NULL when cont_ad object is 234 initially created. */ 235 236 int32 n_calib_frame; /**< Number of frames of calibration data seen so far. */ 237 } cont_ad_t; 238 239 240 /** 241 * Initialize a continuous listening/silence filtering object. 242 * 243 * One time initialization of a continuous listening/silence filtering 244 * object/module. This can work in either "stream mode", where it 245 * reads data from an audio device represented by 246 * <code>ad_rec_t</code>, or in "block mode", where it filters out 247 * silence regions from blocks of data passed into it. 248 * 249 * @param ad An audio device to read from, or NULL to operate in block mode. 250 * @param adfunc The function used to read audio from <code>ad</code>, 251 * or NULL to operate in block mode. This is usually ad_read(). 252 * @return A pointer to a READ-ONLY structure used in other calls to 253 * the object. If any error occurs, the return value is NULL. 254 */ 255 SPHINXBASE_EXPORT 256 cont_ad_t *cont_ad_init (ad_rec_t *ad, /**< In: The A/D source object to be filtered */ 257 int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max) 258 /**< In: adfunc = source function to be invoked 259 to obtain raw A/D data. See ad.h for the 260 required prototype definition. */ 261 ); 262 263 /** 264 * Initializes a continuous listening object which simply passes data through (!) 265 * 266 * Like cont_ad_init, but put the module in raw mode; i.e., all data is passed 267 * through, unfiltered. (By special request.) 268 */ 269 SPHINXBASE_EXPORT 270 cont_ad_t *cont_ad_init_rawmode (ad_rec_t *ad, 271 int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max)); 272 273 274 /** 275 * Read raw audio data into the silence filter. 276 * 277 * The main read routine for reading speech/silence segmented audio data. Audio 278 * data is copied into the caller provided buffer, much like a file read routine. 279 * 280 * In "block mode", i.e. if NULL was passed as a read function to 281 * <code>cont_ad_init</code>, the data in <code>buf</code> is taken as 282 * input, and any non-silence data is written back to <code>buf</code> 283 * on exit. In this case, you must take care that <code>max</code> 284 * does not overflow the internal buffer of the silence filter. The 285 * available number of samples can be obtained by calling 286 * cont_ad_buffer_space(). Any excess data will be discarded. 287 * 288 * In normal mode, only speech segments are copied; silence segments are dropped. 289 * In rawmode (cont_ad module initialized using cont_ad_init_rawmode()), all data 290 * are passed through to the caller. But, in either case, any single call to 291 * cont_ad_read will never return data that crosses a speech/silence segment 292 * boundary. 293 * 294 * The following variables are updated for use by the caller (see cont_ad_t above): 295 * cont_ad_t.state, 296 * cont_ad_t.read_ts, 297 * cont_ad_t.seglen, 298 * cont_ad_t.siglvl. 299 * 300 * Return value: Number of samples actually read, possibly 0; <0 if EOF on A/D source. 301 */ 302 SPHINXBASE_EXPORT 303 int32 cont_ad_read (cont_ad_t *r, /**< In: Object pointer returned by cont_ad_init */ 304 int16 *buf, /**< In/Out: In block mode, contains input data. 305 On return, buf contains A/D data returned 306 by this function, if any. */ 307 int32 max /**< In: Maximum number of samples to be filled into buf. 308 NOTE: max must be at least 256; otherwise 309 the functions returns -1. */ 310 ); 311 312 /** 313 * Get the maximum number of samples which can be passed into cont_ad_read(). 314 */ 315 SPHINXBASE_EXPORT 316 int32 cont_ad_buffer_space(cont_ad_t *r); 317 318 /** 319 * Calibrate the silence filter. 320 * 321 * Calibration to determine an initial silence threshold. This function can be called 322 * any number of times. It should be called at least once immediately after cont_ad_init. 323 * The silence threshold is also updated internally once in a while, so this function 324 * only needs to be called in the middle if there is a definite change in the recording 325 * environment. 326 * The application is responsible for making sure that the raw audio source is turned on 327 * before the calibration. 328 * Return value: 0 if successful, <0 otherwise. 329 */ 330 SPHINXBASE_EXPORT 331 int32 cont_ad_calib (cont_ad_t *cont /**< In: object pointer returned by cont_ad_init */ 332 ); 333 334 /** 335 * Calibrate the silence filter without an audio device. 336 * 337 * If the application has not passed an audio device into the silence filter 338 * at initialisation, this routine can be used to calibrate the filter. The 339 * buf (of length max samples) should contain audio data for calibration. This 340 * data is assumed to be completely consumed. More than one call may be 341 * necessary to fully calibrate. 342 * Return value: 0 if successful, <0 on failure, >0 if calibration not 343 * complete. 344 */ 345 SPHINXBASE_EXPORT 346 int32 cont_ad_calib_loop (cont_ad_t *r, int16 *buf, int32 max); 347 348 /** 349 * Get the number of samples required to calibrate the silence filter. 350 * 351 * Since, as mentioned above, the calibration data is assumed to be 352 * fully consumed, it may be desirable to "hold onto" this data in 353 * case it contains useful speech. This function returns the number 354 * of samples required to calibrate the silence filter, which is 355 * useful in allocating a buffer to store this data. 356 * 357 * @return Number of samples required for successful calibration. 358 */ 359 SPHINXBASE_EXPORT 360 int32 cont_ad_calib_size(cont_ad_t *r); 361 362 /** 363 * Set silence and speech threshold parameters. 364 * 365 * The silence threshold is the max power 366 * level, RELATIVE to the peak background noise level, in any silence frame. Similarly, 367 * the speech threshold is the min power level, RELATIVE to the peak background noise 368 * level, in any speech frame. In general, silence threshold <= speech threshold. 369 * Increasing the thresholds (say, from the default value of 2 to 3 or 4) reduces the 370 * sensitivity to background noise, but may also increase the chances of clipping actual 371 * speech. 372 * @return: 0 if successful, <0 otherwise. 373 */ 374 SPHINXBASE_EXPORT 375 int32 cont_ad_set_thresh (cont_ad_t *cont, /**< In: Object ptr from cont_ad_init */ 376 int32 sil, /**< In: silence threshold (default 2) */ 377 int32 sp /**< In: speech threshold (default 2) */ 378 ); 379 380 381 /** 382 * Set the changable parameters. 383 * 384 * delta_sil, delta_speech, min_noise, and max_noise are in dB, 385 * winsize, speech_onset, sil_onset, leader and trailer are in frames of 386 * 16 ms length (256 samples @ 16kHz sampling). 387 */ 388 SPHINXBASE_EXPORT 389 int32 cont_ad_set_params (cont_ad_t *r, int32 delta_sil, int32 delta_speech, 390 int32 min_noise, int32 max_noise, 391 int32 winsize, int32 speech_onset, int32 sil_onset, 392 int32 leader, int32 trailer, 393 float32 adapt_rate); 394 395 /** 396 * PWP 1/14/98 -- get the changable params. 397 * 398 * delta_sil, delta_speech, min_noise, and max_noise are in dB, 399 * winsize, speech_onset, sil_onset, leader and trailer are in frames of 400 * 16 ms length (256 samples @ 16kHz sampling). 401 */ 402 SPHINXBASE_EXPORT 403 int32 cont_ad_get_params (cont_ad_t *r, int32 *delta_sil, int32 *delta_speech, 404 int32 *min_noise, int32 *max_noise, 405 int32 *winsize, int32 *speech_onset, int32 *sil_onset, 406 int32 *leader, int32 *trailer, 407 float32 *adapt_rate); 408 409 /** 410 * Reset, discarding any accumulated speech segments. 411 * @return 0 if successful, <0 otherwise. 412 */ 413 SPHINXBASE_EXPORT 414 int32 cont_ad_reset (cont_ad_t *cont); /* In: Object pointer from cont_ad_init */ 415 416 417 /** 418 * Close the continuous listening object. 419 */ 420 SPHINXBASE_EXPORT 421 int32 cont_ad_close (cont_ad_t *cont); /* In: Object pointer from cont_ad_init */ 422 423 424 /** 425 * Dump the power histogram. For debugging... 426 */ 427 SPHINXBASE_EXPORT 428 void cont_ad_powhist_dump (FILE *fp, cont_ad_t *cont); 429 430 431 /** 432 * Detach the given continuous listening module from the associated audio device. 433 * @return 0 if successful, -1 otherwise. 434 */ 435 SPHINXBASE_EXPORT 436 int32 cont_ad_detach (cont_ad_t *c); 437 438 439 /** 440 * Attach the continuous listening module to the given audio device/function. 441 * (Like cont_ad_init, but without the calibration.) 442 * @return 0 if successful, -1 otherwise. 443 */ 444 SPHINXBASE_EXPORT 445 int32 cont_ad_attach (cont_ad_t *c, ad_rec_t *a, int32 (*func)(ad_rec_t *, int16 *, int32)); 446 447 448 /** 449 * Set a file for dumping raw audio input. 450 * 451 * The application can ask cont_ad to dump the raw audio input that cont_ad 452 * processes to a file. Use this function to give the FILE* to the cont_ad 453 * object. If invoked with fp == NULL, dumping is turned off. The application 454 * is responsible for opening and closing the file. If fp is non-NULL, cont_ad 455 * assumes the file pointer is valid and opened for writing. 456 * 457 * @return 0 if successful, -1 otherwise. 458 */ 459 SPHINXBASE_EXPORT 460 int32 cont_ad_set_rawfp (cont_ad_t *c, /* The cont_ad object being addressed */ 461 FILE *fp); /* File to which raw audio data is to 462 be dumped; NULL to stop dumping. */ 463 464 /** 465 * Set the file to which cont_ad logs its progress. 466 * 467 * Mainly for debugging. If <code>fp</code> is NULL, logging is turned off. 468 * 469 * @return 0 if successful, -1 otherwise. 470 */ 471 SPHINXBASE_EXPORT 472 int32 cont_ad_set_logfp (cont_ad_t *c, /* The cont_ad object being addressed */ 473 FILE *fp); /* File to which logs are written; 474 NULL to stop logging. */ 475 476 /** 477 * Set the silence and speech thresholds. 478 * 479 * For this to remain permanently in effect, the auto_thresh field of 480 * the continuous listening module should be set to FALSE or 0. 481 * Otherwise the thresholds may be modified by the noise- level 482 * adaptation. 483 */ 484 SPHINXBASE_EXPORT 485 int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech); 486 487 #ifdef __cplusplus 488 } 489 #endif 490 491 492 #endif 493