1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * feat.h -- Cepstral features computation.
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1999 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  * $Log$
49  * Revision 1.1  2006/04/05  20:27:30  dhdfu
50  * A Great Reorganzation of header files and executables
51  *
52  * Revision 1.17  2006/02/23 03:59:40  arthchan2003
53  * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH: a, Free buffers correctly. b, Fixed dox-doc.
54  *
55  * Revision 1.16.4.1  2005/07/05 06:25:08  arthchan2003
56  * Fixed dox-doc.
57  *
58  * Revision 1.16  2005/06/22 03:29:35  arthchan2003
59  * Makefile.am s  for all subdirectory of libs3decoder/
60  *
61  * Revision 1.5  2005/06/13 04:02:56  archan
62  * Fixed most doxygen-style documentation under libs3decoder.
63  *
64  * Revision 1.4  2005/04/21 23:50:26  archan
65  * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in.  At this moment, everything in search mode 5 is already done.  It is time to test the idea whether the search can really be used.
66  *
67  * Revision 1.3  2005/03/30 01:22:46  archan
68  * Fixed mistakes in last updates. Add
69  *
70  *
71  * 20.Apr.2001  RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
72  *              Adding feat_free() to free allocated memory
73  *
74  * 04-Jan-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
75  * 		Started.
76  */
77 
78 
79 #ifndef _S3_FEAT_H_
80 #define _S3_FEAT_H_
81 
82 #include <stdio.h>
83 
84 /* Win32/WinCE DLL gunk */
85 #include <sphinxbase/sphinxbase_export.h>
86 #include <sphinxbase/prim_type.h>
87 #include <sphinxbase/fe.h>
88 #include <sphinxbase/cmn.h>
89 #include <sphinxbase/agc.h>
90 
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
94 #if 0
95 /* Fool Emacs. */
96 }
97 #endif
98 
99 /** \file feat.h
100  * \brief compute the dynamic coefficients from the cepstral vector.
101  */
/** Number of vectors per block allocated for the live-mode decoder. */
#define LIVEBUFBLOCKSIZE        256

/** Maximum number of frames (RAH: believed to still be too large, but better than before). */
#define S3_MAX_FRAMES           15000
105 
/**
 * Command-line option definitions for cepstra-to-feature conversion.
 *
 * Expands to a comma-separated list of brace-enclosed initializers, one per
 * option, each holding { option name, ARG_* type, default value string (or
 * NULL), help text }.  There is no trailing comma after the last entry.
 * The ARG_* constants and NULL are not defined here and must be in scope
 * wherever the macro is expanded.
 */
#define cepstral_to_feature_command_line_macro()                              \
    { "-feat", ARG_STRING, "1s_c_d_dd",                                       \
      "Feature stream type, depends on the acoustic model" },                 \
    { "-ceplen", ARG_INT32, "13",                                             \
      "Number of components in the input feature vector" },                   \
    { "-cmn", ARG_STRING, "current",                                          \
      "Cepstral mean normalization scheme ('current', 'prior', or 'none')" }, \
    { "-cmninit", ARG_STRING, "8.0",                                          \
      "Initial values (comma-separated) for cepstral mean when 'prior' is used" }, \
    { "-varnorm", ARG_BOOLEAN, "no",                                          \
      "Variance normalize each utterance (only if CMN == current)" },         \
    { "-agc", ARG_STRING, "none",                                             \
      "Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" },  \
    { "-agcthresh", ARG_FLOAT32, "2.0",                                       \
      "Initial threshold for automatic gain control" },                       \
    { "-lda", ARG_STRING, NULL,                                               \
      "File containing transformation matrix to be applied to features (single-stream features only)" }, \
    { "-ldadim", ARG_INT32, "0",                                              \
      "Dimensionality of output of feature transformation (0 to use entire matrix)" }, \
    { "-svspec", ARG_STRING, NULL,                                            \
      "Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)" }
147 
148 /**
149  * \struct feat_t
150  * \brief Structure for describing a speech feature type
151  * Structure for describing a speech feature type (no. of streams and stream widths),
152  * as well as the computation for converting the input speech (e.g., Sphinx-II format
153  * MFC cepstra) into this type of feature vectors.
154  */
155 typedef struct feat_s {
156     int refcount;       /**< Reference count. */
157     char *name;		/**< Printable name for this feature type */
158     int32 cepsize;	/**< Size of input speech vector (typically, a cepstrum vector) */
159     int32 n_stream;	/**< Number of feature streams; e.g., 4 in Sphinx-II */
160     uint32 *stream_len;	/**< Vector length of each feature stream */
161     int32 window_size;	/**< Number of extra frames around given input frame needed to compute
162                            corresponding output feature (so total = window_size*2 + 1) */
163     int32 n_sv;         /**< Number of subvectors */
164     uint32 *sv_len;      /**< Vector length of each subvector */
165     int32 **subvecs;    /**< Subvector specification (or NULL for none) */
166     mfcc_t *sv_buf;      /**< Temporary copy buffer for subvector projection */
167     int32 sv_dim;       /**< Total dimensionality of subvector (length of sv_buf) */
168 
169     cmn_type_t cmn;	/**< Type of CMN to be performed on each utterance */
170     int32 varnorm;	/**< Whether variance normalization is to be performed on each utt;
171                            Irrelevant if no CMN is performed */
172     agc_type_t agc;	/**< Type of AGC to be performed on each utterance */
173 
174     /**
175      * Feature computation function.
176      * @param fcb the feat_t describing this feature type
177      * @param input pointer into the input cepstra
178      * @param feat a 2-d array of output features (n_stream x stream_len)
179      * @return 0 if successful, -ve otherwise.
180      *
181      * Function for converting window of input speech vector
182      * (input[-window_size..window_size]) to output feature vector
183      * (feat[stream][]).  If NULL, no conversion available, the
184      * speech input must be feature vector itself.
185      **/
186     void (*compute_feat)(struct feat_s *fcb, mfcc_t **input, mfcc_t **feat);
187     cmn_t *cmn_struct;	/**< Structure that stores the temporary variables for cepstral
188                            means normalization*/
189     agc_t *agc_struct;	/**< Structure that stores the temporary variables for acoustic
190                            gain control*/
191 
192     mfcc_t **cepbuf;    /**< Circular buffer of MFCC frames for live feature computation. */
193     mfcc_t **tmpcepbuf; /**< Array of pointers into cepbuf to handle border cases. */
194     int32   bufpos;     /**< Write index in cepbuf. */
195     int32   curpos;     /**< Read index in cepbuf. */
196 
197     mfcc_t ***lda; /**< Array of linear transformations (for LDA, MLLT, or whatever) */
198     uint32 n_lda;   /**< Number of linear transformations in lda. */
199     uint32 out_dim; /**< Output dimensionality */
200 } feat_t;
201 
/*
 * Accessor macros for feat_t.
 *
 * Each macro takes a pointer to a feat_t as `f`.  The argument is always
 * used as `(f)`, so any pointer-valued expression (e.g. `&fcb` or
 * `array + i`) may be passed.  Note that `f` (and `i`) may be evaluated
 * more than once, so avoid arguments with side effects.
 */

/**
 * Name of feature type.
 */
#define feat_name(f)		((f)->name)
/**
 * Input dimensionality of feature.
 */
#define feat_cepsize(f)		((f)->cepsize)
/**
 * Size of dynamic feature window.
 */
#define feat_window_size(f)	((f)->window_size)
/**
 * Number of feature streams.
 *
 * @deprecated Do not use this, use feat_dimension1() instead.
 */
#define feat_n_stream(f)	((f)->n_stream)
/**
 * Length of feature stream i.
 *
 * @deprecated Do not use this, use feat_dimension2() instead.
 */
#define feat_stream_len(f,i)	((f)->stream_len[i])
/**
 * Number of streams or subvectors in feature output.
 *
 * Yields n_sv when it is non-zero, otherwise n_stream.
 */
#define feat_dimension1(f)	((f)->n_sv ? (f)->n_sv : (f)->n_stream)
/**
 * Dimensionality of stream/subvector i in feature output.
 *
 * Yields out_dim when an LDA transform is present; otherwise sv_len[i]
 * when subvector lengths are set; otherwise stream_len[i].
 */
#define feat_dimension2(f,i)	((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : (f)->stream_len[i]))
/**
 * Total dimensionality of feature output.
 */
#define feat_dimension(f)	((f)->out_dim)
/**
 * Array with stream/subvector lengths
 *
 * Yields the address of out_dim (a one-element array) when an LDA
 * transform is present; otherwise sv_len when set; otherwise stream_len.
 */
#define feat_stream_lengths(f)  ((f)->lda ? (&(f)->out_dim) : ((f)->sv_len ? (f)->sv_len : (f)->stream_len))
242 
243 /**
244  * Parse subvector specification string.
245  *
246  * Format of specification:
247  *   \li '/' separated list of subvectors
248  *   \li each subvector is a ',' separated list of subranges
249  *   \li each subrange is a single \verbatim <number> \endverbatim or
250  *       \verbatim <number>-<number> \endverbatim (inclusive), where
251  *       \verbatim <number> \endverbatim is a feature vector dimension
252  *       specifier.
253  *
254  * E.g., "24,0-11/25,12-23/26,27-38" has:
255  *   \li 3 subvectors
256  *   \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11.
257  *   \li etc.
258  *
259  * @param str subvector specification string.
260  * @return allocated 2-D array of subvector specs (free with
261  * subvecs_free()).  If there are N subvectors specified, subvec[N] =
262  * NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of
263  * feature dims.
264  */
265 SPHINXBASE_EXPORT
266 int32 **parse_subvecs(char const *str);
267 
268 /**
269  * Free array of subvector specs.
270  */
271 SPHINXBASE_EXPORT
272 void subvecs_free(int32 **subvecs);
273 
274 
275 /**
276  * Allocate an array to hold several frames worth of feature vectors.  The returned value
277  * is the mfcc_t ***data array, organized as follows:
278  *
279  * - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ...
280  * - data[1][0] = frame 1 stream 0 vector, data[0][1] = frame 1 stream 1 vector, ...
281  * - data[2][0] = frame 2 stream 0 vector, data[0][1] = frame 2 stream 1 vector, ...
282  * - ...
283  *
284  * NOTE: For I/O convenience, the entire data area is allocated as one contiguous block.
285  * @return pointer to the allocated space if successful, NULL if any error.
286  */
287 SPHINXBASE_EXPORT
288 mfcc_t ***feat_array_alloc(feat_t *fcb,	/**< In: Descriptor from feat_init(), used
289 					     to obtain number of streams and stream sizes */
290                            int32 nfr	/**< In: Number of frames for which to allocate */
291     );
292 
293 /**
294  * Realloate the array of features. Requires us to know the old size
295  */
296 SPHINXBASE_EXPORT
297 mfcc_t ***feat_array_realloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used
298 					      to obtain number of streams and stream sizes */
299 			     mfcc_t ***old_feat, /**< Feature array. Freed */
300                              int32 ofr,	/**< In: Previous number of frames */
301                              int32 nfr	/**< In: Number of frames for which to allocate */
302     );
303 
304 /**
305  * Free a buffer allocated with feat_array_alloc()
306  */
307 SPHINXBASE_EXPORT
308 void feat_array_free(mfcc_t ***feat);
309 
310 
311 /**
312  * Initialize feature module to use the selected type of feature stream.
313  * One-time only initialization at the beginning of the program.  Input type
314  * is a string defining the  kind of input->feature conversion desired:
315  *
316  * - "s2_4x":     s2mfc->Sphinx-II 4-feature stream,
317  * - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream,
318  * - "s3_1x39":   s2mfc->Sphinx 3.0 single feature stream,
319  * - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated
320  *   feature stream lengths.  In this case, the input data is already in the
321  *   feature format and there is no conversion necessary.
322  *
323  * @return (feat_t *) descriptor if successful, NULL if error.  Caller
324  * must not directly modify the contents of the returned value.
325  */
326 SPHINXBASE_EXPORT
327 feat_t *feat_init(char const *type,/**< In: Type of feature stream */
328                   cmn_type_t cmn, /**< In: Type of cepstram mean normalization to
329                                      be done before feature computation; can be
330                                      CMN_NONE (for none) */
331                   int32 varnorm,  /**< In: (boolean) Whether variance
332                                      normalization done on each utt; only
333                                      applicable if CMN also done */
334                   agc_type_t agc, /**< In: Type of automatic gain control to be
335                                      done before feature computation */
336                   int32 breport, /**< In: Whether to show a report for feat_t */
337                   int32 cepsize  /**< Number of components in the input vector
338                                     (or 0 for the default for this feature type,
339                                     which is usually 13) */
340     );
341 
342 /**
343  * Add an LDA transformation to the feature module from a file.
344  * @return 0 for success or -1 if reading the LDA file failed.
345  **/
346 SPHINXBASE_EXPORT
347 int32 feat_read_lda(feat_t *feat,	 /**< In: Descriptor from feat_init() */
348                     const char *ldafile, /**< In: File to read the LDA matrix from. */
349                     int32 dim		 /**< In: Dimensionality of LDA output. */
350     );
351 
352 /**
353  * Transform a block of features using the feature module's LDA transform.
354  **/
355 SPHINXBASE_EXPORT
356 void feat_lda_transform(feat_t *fcb,		/**< In: Descriptor from feat_init() */
357                         mfcc_t ***inout_feat,	/**< Feature block to transform. */
358                         uint32 nfr		/**< In: Number of frames in inout_feat. */
359     );
360 
361 /**
362  * Add a subvector specification to the feature module.
363  *
364  * The subvector splitting will be performed after dynamic feature
365  * computation, CMN, AGC, and any LDA transformation.  The number of
366  * streams in the dynamic feature type must be one, as with LDA.
367  *
368  * After adding a subvector specification, the output of feature
369  * computation will be split into multiple subvectors, and
370  * feat_array_alloc() will allocate pointers accordingly.  The number
371  * of <em>streams</em> will remain the
372  *
373  * @param fcb the feature descriptor.
374  * @param subvecs subvector specification.  This pointer is retained
375  * by the feat_t and should not be freed manually.
376  * @return 0 for success or -1 if the subvector specification was
377  * invalid.
378  */
379 SPHINXBASE_EXPORT
380 int feat_set_subvecs(feat_t *fcb, int32 **subvecs);
381 
382 /**
383  * Print the given block of feature vectors to the given FILE.
384  */
385 SPHINXBASE_EXPORT
386 void feat_print(feat_t *fcb,		/**< In: Descriptor from feat_init() */
387 		mfcc_t ***feat,		/**< In: Feature data to be printed */
388 		int32 nfr,		/**< In: Number of frames of feature data above */
389 		FILE *fp		/**< In: Output file pointer */
390     );
391 
392 
393 /**
394  * Read a specified MFC file (or given segment within it), perform
395  * CMN/AGC as indicated by <code>fcb</code>, and compute feature
396  * vectors.  Feature vectors are computed for the entire segment
397  * specified, by including additional surrounding or padding frames to
398  * accommodate the feature windows.
399  *
400  * @return Number of frames of feature vectors computed if successful;
401  * -1 if any error.  <code>If</code> feat is NULL, then no actual
402  * computation will be done, and the number of frames which must be
403  * allocated will be returned.
404  *
405  * A note on how the file path is constructed: If the control file
406  * already specifies extension or absolute path, then these are not
407  * applied. The default extension is defined by the application.
408  */
409 SPHINXBASE_EXPORT
410 int32 feat_s2mfc2feat(feat_t *fcb,	/**< In: Descriptor from feat_init() */
411 		      const char *file,	/**< In: File to be read */
412 		      const char *dir,	/**< In: Directory prefix for file,
413 					   if needed; can be NULL */
414 		      const char *cepext,/**< In: Extension of the
415 					   cepstrum file.It cannot be
416 					   NULL */
417 		      int32 sf, int32 ef,   /* Start/End frames
418                                                within file to be read. Use
419                                                0,-1 to process entire
420                                                file */
421 		      mfcc_t ***feat,	/**< Out: Computed feature vectors;
422 					   caller must allocate this space */
423 		      int32 maxfr	/**< In: Available space (number of frames) in
424 					   above feat array; it must be
425 					   sufficient to hold the result.
426                                            Pass -1 for no limit. */
427     );
428 
429 
430 /**
431  * Feature computation routine for live mode decoder.
432  *
433  * This function computes features for blocks of incoming data. It
434  * retains an internal buffer for computing deltas, which means that
435  * the number of output frames will not necessarily equal the number
436  * of input frames.
437  *
438  * <strong>It is very important</strong> to realize that the number of
439  * output frames can be <strong>greater than</strong> the number of
440  * input frames, specifically when <code>endutt</code> is true.  It is
441  * guaranteed to never exceed <code>*inout_ncep +
442  * feat_window_size(fcb)</code>.  You <strong>MUST</strong> have
443  * allocated at least that many frames in <code>ofeat</code>, or you
444  * will experience a buffer overflow.
445  *
446  * If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will
447  * be done.  Otherwise only CMN_PRIOR and AGC_EMAX will be done.
448  *
449  * If beginutt is false, endutt is true, and the number of input
450  * frames exceeds the input size, then end-of-utterance processing
451  * won't actually be done.  This condition can easily be checked,
452  * because <code>*inout_ncep</code> will equal the return value on
453  * exit, and will also be smaller than the value of
454  * <code>*inout_ncep</code> on entry.
455  *
456  * @return The number of output frames actually computed.
457  **/
458 SPHINXBASE_EXPORT
459 int32 feat_s2mfc2feat_live(feat_t  *fcb,     /**< In: Descriptor from feat_init() */
460                            mfcc_t **uttcep,  /**< In: Incoming cepstral buffer */
461                            int32 *inout_ncep,/**< In: Size of incoming buffer.
462                                                 Out: Number of incoming frames consumed. */
463                            int32 beginutt,   /**< In: Begining of utterance flag */
464                            int32 endutt,     /**< In: End of utterance flag */
465                            mfcc_t ***ofeat   /**< In: Output feature buffer.  See
466                                                 <strong>VERY IMPORTANT</strong> note
467                                                 about the size of this buffer above. */
468     );
469 
470 
471 /**
472  * Retain ownership of feat_t.
473  *
474  * @return pointer to retained feat_t.
475  */
476 SPHINXBASE_EXPORT
477 feat_t *feat_retain(feat_t *f);
478 
479 /**
480  * Release resource associated with feat_t
481  *
482  * @return new reference count (0 if freed)
483  */
484 SPHINXBASE_EXPORT
485 int feat_free(feat_t *f /**< In: feat_t */
486     );
487 
488 /**
489  * Report the feat_t data structure
490  */
491 SPHINXBASE_EXPORT
492 void feat_report(feat_t *f /**< In: feat_t */
493     );
494 #ifdef __cplusplus
495 }
496 #endif
497 
498 
499 #endif
500