1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * cont_ad.h -- Continuous A/D listening and silence filtering module.
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1996 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  *
49  * 13-Jul-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
50  * 		Added spf and adbufsize to cont_ad_t in order to support variable
51  * 		frame sizes depending on audio sampling rate.
52  *
53  * 30-Jun-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
54  * 		Added FILE* argument to cont_ad_powhist_dump().
55  *
56  * 16-Jan-98	Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University
57  * 		Changed to use dB instead of the weird power measure.
58  * 		Added most system parameters to cont_ad_t instead of hardwiring
59  * 		them in cont_ad.c.
60  * 		Added cont_ad_set_params() and cont_ad_get_params().
61  *
62  * 28-Jul-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
63  * 		Added cont_ad_t.siglvl.
64  *
65  * 27-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
66  * 		Added the option for cont_ad_read to return -1 on EOF.
67  *
68  * 21-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
69  * 		Added cont_ad_set_thresh().
70  *
71  * 20-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
72  * 		Separated thresholds for speech and silence.
73  *
74  * 17-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
75  * 		Created, based loosely on Steve Reed's original implementation.
76  */
77 
78 
79 #ifndef _CONT_AD_H_
80 #define _CONT_AD_H_
81 
82 /* Win32/WinCE DLL gunk */
83 #include <sphinxbase/sphinxbase_export.h>
84 #include <sphinxbase/prim_type.h>
85 #include <sphinxbase/ad.h>
86 
87 /**
88  * \file cont_ad.h
89  * \brief Continuous A/D listening and silence filtering module.
90  *
91  * This module is intended to be interposed as a filter between any
92  * raw A/D source and the application to remove silence regions.  Its
93  * main purpose is to remove regions of silence from the raw input
94  * speech.  It is initialized with a raw A/D source function (during
95  * the cont_ad_init call).  The application is responsible for setting
96  * up the A/D source, turning recording on and off as it desires.
97  * Filtered A/D data can be read by the application using the
98  * cont_ad_read function.
99  *
100  * In other words, the application calls cont_ad_read instead of the
101  * raw A/D source function (e.g., ad_read in libad) to obtain filtered
102  * A/D data with silence regions removed.  This module itself does not
103  * enforce any other structural changes to the application.
104  *
105  * The cont_ad_read function also updates an "absolute" timestamp (see
106  * cont_ad_t.read_ts) at the end of each invocation.  The timestamp
107  * indicates the total number of samples of A/D data read until this
108  * point, including data discarded as silence frames.  The application
109  * is responsible for using this timestamp to make any policy
110  * decisions regarding utterance boundaries and similar matters.
111  */
112 
113 
114 #include <stdio.h>
115 
116 
117 #ifdef __cplusplus
118 extern "C" {
119 #endif
120 #if 0
121 /* Fool Emacs. */
122 }
123 #endif
124 
125 /* States of the continuous listening module (values of cont_ad_t.state and cont_ad_t.tail_state) */
126 #define CONT_AD_STATE_SIL	0
127 #define CONT_AD_STATE_SPEECH	1
128 
129 
130 /**
131  * \struct spseg_t
132  * \brief (For internal use.)  Linked-list node describing one speech (non-silence) segment not yet consumed by the
133  * application.
134  */
135 typedef struct spseg_s {
136     int32 startfrm;	/**< Frame-id in adbuf (see cont_ad_t.adbuf) of start of this segment */
137     int32 nfrm;		/**< Number of frames in segment (may wrap around adbuf) */
138     struct spseg_s *next;	/**< Next speech segment (with some intervening silence) */
139 } spseg_t;
140 
141 
142 /**
143  * \struct cont_ad_t
144  * \brief Continuous listening module or object.
145  * An application can open and maintain several such objects, if necessary.
146  *
147  * FYI: The module is always in one of two states: SILENCE or SPEECH.  Transitions between the
148  * two are detected by sliding a window spanning several frames and looking for some minimum
149  * number of frames of the other type.
150  */
151 typedef struct {
152     /** Function to be called for obtaining A/D data (see prototype for ad_read in ad.h) */
153     int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max);
154     ad_rec_t *ad;	/**< A/D device argument for adfunc.  Also, ad->sps is used to
155 			   determine frame size (see spf) */
156     int32 rawmode;	/**< Pass all input data through, without filtering silence */
157 
158     int16 *adbuf;	/**< Circular buffer for maintaining A/D data read until consumed */
159 
160     /* **************************************************************************
161      * state, read_ts, seglen, and siglvl are provided for READ-ONLY use by client
162      * applications, and are updated by calls to cont_ad_read().  All
163      * other variables should be left alone.
164      */
165     int32 state;	/**< State of data returned by most recent cont_ad_read call;
166 			   CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH. */
167     int32 read_ts;	/**< Absolute timestamp (total no. of raw samples consumed
168 			   up to the most recent cont_ad_read call, starting from
169 			   the very beginning).  Note that this is a 32-bit
170 			   integer; applications should guard against overflow. */
171     int32 seglen;	/**< Total no. of raw samples consumed in the segment
172 			   returned by the most recent cont_ad_read call.  Can be
173 			   used to detect silence segments that have stretched long
174 			   enough to terminate an utterance */
175     int32 siglvl;	/**< Max signal level for the data consumed by the most recent
176 			   cont_ad_read call (dB range: 0-99).  Can be used to
177 			   update a V-U meter, for example. */
178     /* ************************************************************************ */
179 
180     int32 sps;		/**< Samples/sec; moved from ad->sps to break dependence on
181 			   ad by N. Roy.*/
182 
183     int32 eof;		/**< Whether the source ad device has encountered EOF */
184 
185     int32 spf;		/**< Samples/frame; audio level is analyzed within frames */
186     int32 adbufsize;	/**< Buffer size (Number of samples) */
187     int32 prev_sample;	/**< For pre-emphasis filter */
188     int32 headfrm;	/**< Frame number in adbuf with unconsumed A/D data */
189     int32 n_frm;	/**< Number of complete frames of unconsumed A/D data in adbuf */
190     int32 n_sample;	/**< Number of samples of unconsumed data in adbuf */
191     int32 tot_frm;	/**< Total number of frames of A/D data read, including consumed ones */
192     int32 noise_level;	/**< PWP: what we claim as the "current" noise level */
193 
194     int32 *pow_hist;	/**< Histogram of frame power, moving window, decayed */
195     char *frm_pow;	/**< Frame power */
196 
197     int32 auto_thresh;  /**< Do automatic threshold adjustment or not */
198     int32 delta_sil;	/**< Max silence power/frame ABOVE noise level */
199     int32 delta_speech;	/**< Min speech power/frame ABOVE noise level */
200     int32 min_noise;	/**< Noise lower than this we ignore */
201     int32 max_noise;	/**< Noise higher than this signals an error */
202     int32 winsize;	/**< How many frames to look at for speech detection */
203     int32 speech_onset;	/**< Start speech on >= these many frames out of winsize, of >= delta_speech */
204     int32 sil_onset;	/**< End speech on >= these many frames out of winsize, of <= delta_sil */
205     int32 leader;	/**< Pad beginning of speech with this many extra frames */
206     int32 trailer;	/**< Pad end of speech with this many extra frames */
207 
208     int32 thresh_speech;/**< Frame considered to be speech if power >= thresh_speech
209 			   (for transitioning from SILENCE to SPEECH state) */
210     int32 thresh_sil;	/**< Frame considered to be silence if power <= thresh_sil
211 			   (for transitioning from SPEECH to SILENCE state) */
212     int32 thresh_update;/**< Number of frames before next update to pow_hist/thresholds */
213     float32 adapt_rate;	/**< Linear interpolation constant for rate at which noise level adapted
214 			   to each estimate;
215 			   range: 0-1; 0=> no adaptation, 1=> instant adaptation */
216 
217     int32 tail_state;	/**< State at the end of its internal buffer (internal use):
218 			   CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.  Note: This is
219 			   different from cont_ad_t.state. */
220     int32 win_startfrm;	/**< Where next analysis window begins */
221     int32 win_validfrm;	/**< Number of frames currently available from win_startfrm for analysis */
222     int32 n_other;	/**< If in SILENCE state, number of frames in analysis window considered to
223 			   be speech; otherwise number of frames considered to be silence */
224     spseg_t *spseg_head;/**< First of unconsumed speech segments */
225     spseg_t *spseg_tail;/**< Last of unconsumed speech segments */
226 
227     FILE *rawfp;	/**< If non-NULL, raw audio input data processed by cont_ad
228 			   is dumped to this file.  Controlled by user application
229 			   via cont_ad_set_rawfp().  NULL when cont_ad object is
230 			   initially created. */
231     FILE *logfp;	/**< If non-NULL, write detailed logs of this object's
232 			   progress to the file.  Controlled by user application
233 			   via cont_ad_set_logfp().  NULL when cont_ad object is
234 			   initially created. */
235 
236     int32 n_calib_frame; /**< Number of frames of calibration data seen so far. */
237 } cont_ad_t;
238 
239 
240 /**
241  * Initialize a continuous listening/silence filtering object.
242  *
243  * One-time initialization of a continuous listening/silence filtering
244  * object/module.  This can work in either "stream mode", where it
245  * reads data from an audio device represented by
246  * <code>ad_rec_t</code>, or in "block mode", where it filters out
247  * silence regions from blocks of data passed into cont_ad_read().
248  *
249  * @param ad An audio device to read from, or NULL to operate in block mode.
250  * @param adfunc The function used to read audio from <code>ad</code>,
251  * or NULL to operate in block mode.  This is usually ad_read().
252  * @return A pointer to a READ-ONLY structure used in other calls to
253  * the object.  If any error occurs, the return value is NULL.
254  */
255 SPHINXBASE_EXPORT
256 cont_ad_t *cont_ad_init (ad_rec_t *ad,	/**< In: The A/D source object to be filtered */
257 			 int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max)
258 			 /**< In: adfunc = source function to be invoked
259 					   to obtain raw A/D data.  See ad.h for the
260 					   required prototype definition. */
261 			 );
262 
263 /**
264  * Initialize a continuous listening object which simply passes data through.
265  *
266  * Like cont_ad_init (same ad and adfunc arguments), but puts the module in raw
267  * mode; i.e., all data is passed through, unfiltered.  (By special request.)
268  */
269 SPHINXBASE_EXPORT
270 cont_ad_t *cont_ad_init_rawmode (ad_rec_t *ad,
271 				 int32 (*adfunc)(ad_rec_t *ad, int16 *buf, int32 max));
272 
273 
274 /**
275  * Read raw audio data into the silence filter.
276  *
277  * The main read routine for reading speech/silence segmented audio data.  Audio
278  * data is copied into the caller-provided buffer, much like a file read routine.
279  *
280  * In "block mode", i.e. if NULL was passed as a read function to
281  * <code>cont_ad_init</code>, the data in <code>buf</code> is taken as
282  * input, and any non-silence data is written back to <code>buf</code>
283  * on exit.  In this case, you must take care that <code>max</code>
284  * does not overflow the internal buffer of the silence filter.  The
285  * available number of samples can be obtained by calling
286  * cont_ad_buffer_space().  Any excess data will be discarded.
287  *
288  * In normal mode, only speech segments are copied; silence segments are dropped.
289  * In rawmode (cont_ad module initialized using cont_ad_init_rawmode()), all data
290  * are passed through to the caller.  But, in either case, any single call to
291  * cont_ad_read will never return data that crosses a speech/silence segment
292  * boundary.
293  *
294  * The following variables are updated for use by the caller (see cont_ad_t):
295  *   cont_ad_t.state,
296  *   cont_ad_t.read_ts,
297  *   cont_ad_t.seglen,
298  *   cont_ad_t.siglvl.
299  *
300  * @return Number of samples actually read, possibly 0; <0 if EOF on A/D source.
301  */
302 SPHINXBASE_EXPORT
303 int32 cont_ad_read (cont_ad_t *r,	/**< In: Object pointer returned by cont_ad_init */
304 		    int16 *buf,		/**< In/Out: In block mode, contains input data.
305                                            On return, buf contains A/D data returned
306 					   by this function, if any. */
307 		    int32 max		/**< In: Maximum number of samples to be filled into buf.
308 					   NOTE: max must be at least 256; otherwise
309 					   the function returns -1. */
310 	);
311 
312 /**
313  * Get the maximum number of samples which can be passed into cont_ad_read() in block mode.
314  */
315 SPHINXBASE_EXPORT
316 int32 cont_ad_buffer_space(cont_ad_t *r);
317 
318 /**
319  * Calibrate the silence filter.
320  *
321  * Calibration to determine an initial silence threshold.  This function can be called
322  * any number of times.  It should be called at least once immediately after cont_ad_init.
323  * The silence threshold is also updated internally once in a while, so this function
324  * only needs to be called in the middle if there is a definite change in the recording
325  * environment.
326  * The application is responsible for making sure that the raw audio source is turned on
327  * before the calibration.
328  * @return 0 if successful, <0 otherwise.
329  */
330 SPHINXBASE_EXPORT
331 int32 cont_ad_calib (cont_ad_t *cont	/**< In: object pointer returned by cont_ad_init */
332 		     );
333 
334 /**
335  * Calibrate the silence filter without an audio device.
336  *
337  * If the application has not passed an audio device into the silence filter
338  * at initialisation, this routine can be used to calibrate the filter. The
339  * buf (of length max samples) should contain audio data for calibration. This
340  * data is assumed to be completely consumed. More than one call may be
341  * necessary to fully calibrate.
342  * @return 0 if successful, <0 on failure, >0 if calibration not
343  * complete.
344  */
345 SPHINXBASE_EXPORT
346 int32 cont_ad_calib_loop (cont_ad_t *r, int16 *buf, int32 max);
347 
348 /**
349  * Get the number of samples required to calibrate the silence filter.
350  *
351  * Since the calibration data passed to cont_ad_calib_loop() is assumed
352  * to be fully consumed, it may be desirable to "hold onto" this data in
353  * case it contains useful speech.  This function returns the number
354  * of samples required to calibrate the silence filter, which is
355  * useful in allocating a buffer to store this data.
356  *
357  * @return Number of samples required for successful calibration.
358  */
359 SPHINXBASE_EXPORT
360 int32 cont_ad_calib_size(cont_ad_t *r);
361 
362 /**
363  * Set silence and speech threshold parameters.
364  *
365  * The silence threshold is the max power
366  * level, RELATIVE to the peak background noise level, in any silence frame.  Similarly,
367  * the speech threshold is the min power level, RELATIVE to the peak background noise
368  * level, in any speech frame.  In general, silence threshold <= speech threshold.
369  * Increasing the thresholds (say, from the default value of 2 to 3 or 4) reduces the
370  * sensitivity to background noise, but may also increase the chances of clipping actual
371  * speech.
372  * @return 0 if successful, <0 otherwise.
373  */
374 SPHINXBASE_EXPORT
375 int32 cont_ad_set_thresh (cont_ad_t *cont,	/**< In: Object ptr from cont_ad_init */
376 			  int32 sil,	/**< In: silence threshold (default 2) */
377 			  int32 sp	/**< In: speech threshold (default 2) */
378 			  );
379 
380 
381 /**
382  * Set the changeable parameters.
383  *
384  *   delta_sil, delta_speech, min_noise, and max_noise are in dB,
385  *   winsize, speech_onset, sil_onset, leader and trailer are in frames of
386  *   16 ms length (256 samples @ 16kHz sampling).
387  */
388 SPHINXBASE_EXPORT
389 int32 cont_ad_set_params (cont_ad_t *r, int32 delta_sil, int32 delta_speech,
390 			  int32 min_noise, int32 max_noise,
391 			  int32 winsize, int32 speech_onset, int32 sil_onset,
392 			  int32 leader, int32 trailer,
393 			  float32 adapt_rate);
394 
395 /**
396  * Get the changeable parameters (PWP 1/14/98).
397  *
398  *   delta_sil, delta_speech, min_noise, and max_noise are in dB,
399  *   winsize, speech_onset, sil_onset, leader and trailer are in frames of
400  *   16 ms length (256 samples @ 16kHz sampling).
401  */
402 SPHINXBASE_EXPORT
403 int32 cont_ad_get_params (cont_ad_t *r, int32 *delta_sil, int32 *delta_speech,
404 			  int32 *min_noise, int32 *max_noise,
405 			  int32 *winsize, int32 *speech_onset, int32 *sil_onset,
406 			  int32 *leader, int32 *trailer,
407 			  float32 *adapt_rate);
408 
409 /**
410  * Reset the continuous listening object, discarding any accumulated speech segments.
411  * @return 0 if successful, <0 otherwise.
412  */
413 SPHINXBASE_EXPORT
414 int32 cont_ad_reset (cont_ad_t *cont);	/* In: Object pointer returned by cont_ad_init */
415 
416 
417 /**
418  * Close the continuous listening object.
419  * NOTE(review): the return value is not documented in this header -- presumably 0 on success; confirm against cont_ad.c.
420  */
421 SPHINXBASE_EXPORT
422 int32 cont_ad_close (cont_ad_t *cont);	/* In: Object pointer returned by cont_ad_init */
422 
423 
424 /**
425  * Dump the power histogram (see cont_ad_t.pow_hist); output presumably goes to fp -- confirm.  For debugging.
426  */
427 SPHINXBASE_EXPORT
428 void cont_ad_powhist_dump (FILE *fp, cont_ad_t *cont);
429 
430 
431 /**
432  * Detach the given continuous listening module c from its associated audio device.
433  * @return 0 if successful, -1 otherwise.
434  */
435 SPHINXBASE_EXPORT
436 int32 cont_ad_detach (cont_ad_t *c);
437 
438 
439 /**
440  * Attach the continuous listening module c to the given audio device a and read function func.
441  * (Like cont_ad_init, but without the calibration.)
442  * @return 0 if successful, -1 otherwise.
443  */
444 SPHINXBASE_EXPORT
445 int32 cont_ad_attach (cont_ad_t *c, ad_rec_t *a, int32 (*func)(ad_rec_t *, int16 *, int32));
446 
447 
448 /**
449  * Set a file for dumping raw audio input (see cont_ad_t.rawfp).
450  *
451  * The application can ask cont_ad to dump the raw audio input that cont_ad
452  * processes to a file.  Use this function to give the FILE* to the cont_ad
453  * object.  If invoked with fp == NULL, dumping is turned off.  The application
454  * is responsible for opening and closing the file.  If fp is non-NULL, cont_ad
455  * assumes the file pointer is valid and opened for writing.
456  *
457  * @return 0 if successful, -1 otherwise.
458  */
459 SPHINXBASE_EXPORT
460 int32 cont_ad_set_rawfp (cont_ad_t *c,	/* In: The cont_ad object being addressed */
461 			 FILE *fp);	/* In: File to which raw audio data is to
462 					   be dumped; NULL to stop dumping. */
463 
464 /**
465  * Set the file to which cont_ad logs its progress (see cont_ad_t.logfp).
466  *
467  * Mainly for debugging.  If <code>fp</code> is NULL, logging is turned off.
468  *
469  * @return 0 if successful, -1 otherwise.
470  */
471 SPHINXBASE_EXPORT
472 int32 cont_ad_set_logfp (cont_ad_t *c,	/* In: The cont_ad object being addressed */
473 			 FILE *fp);	/* In: File to which logs are written;
474 					   NULL to stop logging. */
475 
476 /**
477  * Set the silence and speech thresholds.
478  *
479  * For this to remain permanently in effect, the auto_thresh field of
480  * the continuous listening module should be set to FALSE or 0.
481  * Otherwise the thresholds may be modified by the noise-level
482  * adaptation.
483  */
484 SPHINXBASE_EXPORT
485 int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech);
486 
487 #ifdef __cplusplus
488 }
489 #endif
490 
491 
492 #endif
493