1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * feat.h -- Cepstral features computation.
39  */
40 
41 #ifndef _S3_FEAT_H_
42 #define _S3_FEAT_H_
43 
44 #include <stdio.h>
45 
46 /* Win32/WinCE DLL gunk */
47 #include <sphinxbase/sphinxbase_export.h>
48 #include <sphinxbase/prim_type.h>
49 #include <sphinxbase/fe.h>
50 #include <sphinxbase/cmn.h>
51 #include <sphinxbase/agc.h>
52 
53 #ifdef __cplusplus
54 extern "C" {
55 #endif
56 #if 0
57 /* Fool Emacs. */
58 }
59 #endif
60 
61 /** \file feat.h
62  * \brief compute the dynamic coefficients from the cepstral vector.
63  */
/** Blocks of 256 vectors are allocated for the livemode decoder. */
#define LIVEBUFBLOCKSIZE 256

/**
 * S3 frame limit (per its name; where it is enforced is not visible here).
 * RAH: believed to still be too large, but better than before.
 */
#define S3_MAX_FRAMES 15000
67 
/**
 * Option-table entries for the cepstra-to-feature conversion settings.
 *
 * Expands to ten brace-enclosed initializers of the form
 * { option name, ARG_* type tag, default value, help text }, separated by
 * commas, with no comma after the last entry.  The ARG_* tags must be
 * provided by whoever expands the macro.
 * NOTE(review): presumably spliced into a command-line argument definition
 * array -- confirm against the callers.
 */
#define cepstral_to_feature_command_line_macro() \
    { "-feat", ARG_STRING, "1s_c_d_dd", \
      "Feature stream type, depends on the acoustic model" }, \
    { "-ceplen", ARG_INT32, "13", \
      "Number of components in the input feature vector" }, \
    { "-cmn", ARG_STRING, "current", \
      "Cepstral mean normalization scheme ('current', 'prior', or 'none')" }, \
    { "-cmninit", ARG_STRING, "8.0", \
      "Initial values (comma-separated) for cepstral mean when 'prior' is used" }, \
    { "-varnorm", ARG_BOOLEAN, "no", \
      "Variance normalize each utterance (only if CMN == current)" }, \
    { "-agc", ARG_STRING, "none", \
      "Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" }, \
    { "-agcthresh", ARG_FLOAT32, "2.0", \
      "Initial threshold for automatic gain control" }, \
    { "-lda", ARG_STRING, NULL, \
      "File containing transformation matrix to be applied to features (single-stream features only)" }, \
    { "-ldadim", ARG_INT32, "0", \
      "Dimensionality of output of feature transformation (0 to use entire matrix)" }, \
    { "-svspec", ARG_STRING, NULL, \
      "Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)" }
109 
110 /**
111  * \struct feat_t
112  * \brief Structure for describing a speech feature type
113  * Structure for describing a speech feature type (no. of streams and stream widths),
114  * as well as the computation for converting the input speech (e.g., Sphinx-II format
115  * MFC cepstra) into this type of feature vectors.
116  */
117 typedef struct feat_s {
118     int refcount;       /**< Reference count. */
119     char *name;		/**< Printable name for this feature type */
120     int32 cepsize;	/**< Size of input speech vector (typically, a cepstrum vector) */
121     int32 n_stream;	/**< Number of feature streams; e.g., 4 in Sphinx-II */
122     uint32 *stream_len;	/**< Vector length of each feature stream */
123     int32 window_size;	/**< Number of extra frames around given input frame needed to compute
124                            corresponding output feature (so total = window_size*2 + 1) */
125     int32 n_sv;         /**< Number of subvectors */
126     uint32 *sv_len;      /**< Vector length of each subvector */
127     int32 **subvecs;    /**< Subvector specification (or NULL for none) */
128     mfcc_t *sv_buf;      /**< Temporary copy buffer for subvector projection */
129     int32 sv_dim;       /**< Total dimensionality of subvector (length of sv_buf) */
130 
131     cmn_type_t cmn;	/**< Type of CMN to be performed on each utterance */
132     int32 varnorm;	/**< Whether variance normalization is to be performed on each utt;
133                            Irrelevant if no CMN is performed */
134     agc_type_t agc;	/**< Type of AGC to be performed on each utterance */
135 
136     /**
137      * Feature computation function.
138      * @param fcb the feat_t describing this feature type
139      * @param input pointer into the input cepstra
140      * @param feat a 2-d array of output features (n_stream x stream_len)
141      * @return 0 if successful, -ve otherwise.
142      *
143      * Function for converting window of input speech vector
144      * (input[-window_size..window_size]) to output feature vector
145      * (feat[stream][]).  If NULL, no conversion available, the
146      * speech input must be feature vector itself.
147      **/
148     void (*compute_feat)(struct feat_s *fcb, mfcc_t **input, mfcc_t **feat);
149     cmn_t *cmn_struct;	/**< Structure that stores the temporary variables for cepstral
150                            means normalization*/
151     agc_t *agc_struct;	/**< Structure that stores the temporary variables for acoustic
152                            gain control*/
153 
154     mfcc_t **cepbuf;    /**< Circular buffer of MFCC frames for live feature computation. */
155     mfcc_t **tmpcepbuf; /**< Array of pointers into cepbuf to handle border cases. */
156     int32   bufpos;     /**< Write index in cepbuf. */
157     int32   curpos;     /**< Read index in cepbuf. */
158 
159     mfcc_t ***lda; /**< Array of linear transformations (for LDA, MLLT, or whatever) */
160     uint32 n_lda;   /**< Number of linear transformations in lda. */
161     uint32 out_dim; /**< Output dimensionality */
162 } feat_t;
163 
/** Accessor for the name field of the feature descriptor @a f. */
#define feat_name(f) ((f)->name)
/** Accessor for the input dimensionality (cepsize field) of @a f. */
#define feat_cepsize(f) ((f)->cepsize)
/** Accessor for the dynamic feature window size (window_size field) of @a f. */
#define feat_window_size(f) ((f)->window_size)
/**
 * Accessor for the number of feature streams (n_stream field) of @a f.
 *
 * @deprecated Do not use this, use feat_dimension1() instead.
 */
#define feat_n_stream(f) ((f)->n_stream)
/**
 * Accessor for the length of feature stream @a i (stream_len[i]) of @a f.
 *
 * @deprecated Do not use this, use feat_dimension2() instead.
 */
#define feat_stream_len(f,i) ((f)->stream_len[i])
/**
 * Number of streams or subvectors in feature output.
 *
 * Yields n_sv when it is non-zero, otherwise n_stream.  The argument is
 * expanded more than once, so it must not have side effects.  Every use
 * of the argument is parenthesized so that expressions such as
 * <code>&fcb</code> or <code>array + 1</code> expand correctly.
 */
#define feat_dimension1(f)	((f)->n_sv ? (f)->n_sv : (f)->n_stream)
/**
 * Dimensionality of stream/subvector i in feature output.
 *
 * Yields out_dim when an LDA transform is present (lda non-NULL);
 * otherwise sv_len[i] when subvector lengths are set (sv_len non-NULL);
 * otherwise stream_len[i].  The arguments are expanded more than once, so
 * they must not have side effects.  Every use of @a f is parenthesized so
 * that expressions such as <code>&fcb</code> or <code>array + 1</code>
 * expand correctly.
 */
#define feat_dimension2(f,i)	((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : (f)->stream_len[i]))
/** Accessor for the total output dimensionality (out_dim field) of @a f. */
#define feat_dimension(f) ((f)->out_dim)
/**
 * Array with stream/subvector lengths.
 *
 * Yields the address of out_dim when an LDA transform is present (lda
 * non-NULL); otherwise sv_len when it is non-NULL; otherwise stream_len.
 * The argument is expanded more than once, so it must not have side
 * effects.  Every use of the argument is parenthesized so that expressions
 * such as <code>&fcb</code> or <code>array + 1</code> expand correctly.
 */
#define feat_stream_lengths(f)  ((f)->lda ? (&(f)->out_dim) : (f)->sv_len ? (f)->sv_len : (f)->stream_len)
204 
205 /**
206  * Parse subvector specification string.
207  *
208  * Format of specification:
209  *   \li '/' separated list of subvectors
210  *   \li each subvector is a ',' separated list of subranges
211  *   \li each subrange is a single \verbatim <number> \endverbatim or
212  *       \verbatim <number>-<number> \endverbatim (inclusive), where
213  *       \verbatim <number> \endverbatim is a feature vector dimension
214  *       specifier.
215  *
216  * E.g., "24,0-11/25,12-23/26,27-38" has:
217  *   \li 3 subvectors
218  *   \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11.
219  *   \li etc.
220  *
221  * @param str subvector specification string.
222  * @return allocated 2-D array of subvector specs (free with
223  * subvecs_free()).  If there are N subvectors specified, subvec[N] =
224  * NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of
225  * feature dims.
226  */
227 SPHINXBASE_EXPORT
228 int32 **parse_subvecs(char const *str);
229 
230 /**
231  * Free array of subvector specs.
232  */
233 SPHINXBASE_EXPORT
234 void subvecs_free(int32 **subvecs);
235 
236 
237 /**
238  * Allocate an array to hold several frames worth of feature vectors.  The returned value
239  * is the mfcc_t ***data array, organized as follows:
240  *
241  * - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ...
242  * - data[1][0] = frame 1 stream 0 vector, data[0][1] = frame 1 stream 1 vector, ...
243  * - data[2][0] = frame 2 stream 0 vector, data[0][1] = frame 2 stream 1 vector, ...
244  * - ...
245  *
246  * NOTE: For I/O convenience, the entire data area is allocated as one contiguous block.
247  * @return pointer to the allocated space if successful, NULL if any error.
248  */
249 SPHINXBASE_EXPORT
250 mfcc_t ***feat_array_alloc(feat_t *fcb,	/**< In: Descriptor from feat_init(), used
251 					     to obtain number of streams and stream sizes */
252                            int32 nfr	/**< In: Number of frames for which to allocate */
253     );
254 
255 /**
256  * Realloate the array of features. Requires us to know the old size
257  */
258 SPHINXBASE_EXPORT
259 mfcc_t ***feat_array_realloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used
260 					      to obtain number of streams and stream sizes */
261 			     mfcc_t ***old_feat, /**< Feature array. Freed */
262                              int32 ofr,	/**< In: Previous number of frames */
263                              int32 nfr	/**< In: Number of frames for which to allocate */
264     );
265 
266 /**
267  * Free a buffer allocated with feat_array_alloc()
268  */
269 SPHINXBASE_EXPORT
270 void feat_array_free(mfcc_t ***feat);
271 
272 
273 /**
274  * Initialize feature module to use the selected type of feature stream.
275  * One-time only initialization at the beginning of the program.  Input type
276  * is a string defining the  kind of input->feature conversion desired:
277  *
278  * - "s2_4x":     s2mfc->Sphinx-II 4-feature stream,
279  * - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream,
280  * - "s3_1x39":   s2mfc->Sphinx 3.0 single feature stream,
281  * - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated
282  *   feature stream lengths.  In this case, the input data is already in the
283  *   feature format and there is no conversion necessary.
284  *
285  * @return (feat_t *) descriptor if successful, NULL if error.  Caller
286  * must not directly modify the contents of the returned value.
287  */
288 SPHINXBASE_EXPORT
289 feat_t *feat_init(char const *type,/**< In: Type of feature stream */
290                   cmn_type_t cmn, /**< In: Type of cepstram mean normalization to
291                                      be done before feature computation; can be
292                                      CMN_NONE (for none) */
293                   int32 varnorm,  /**< In: (boolean) Whether variance
294                                      normalization done on each utt; only
295                                      applicable if CMN also done */
296                   agc_type_t agc, /**< In: Type of automatic gain control to be
297                                      done before feature computation */
298                   int32 breport, /**< In: Whether to show a report for feat_t */
299                   int32 cepsize  /**< Number of components in the input vector
300                                     (or 0 for the default for this feature type,
301                                     which is usually 13) */
302     );
303 
304 /**
305  * Add an LDA transformation to the feature module from a file.
306  * @return 0 for success or -1 if reading the LDA file failed.
307  **/
308 SPHINXBASE_EXPORT
309 int32 feat_read_lda(feat_t *feat,	 /**< In: Descriptor from feat_init() */
310                     const char *ldafile, /**< In: File to read the LDA matrix from. */
311                     int32 dim		 /**< In: Dimensionality of LDA output. */
312     );
313 
314 /**
315  * Transform a block of features using the feature module's LDA transform.
316  **/
317 SPHINXBASE_EXPORT
318 void feat_lda_transform(feat_t *fcb,		/**< In: Descriptor from feat_init() */
319                         mfcc_t ***inout_feat,	/**< Feature block to transform. */
320                         uint32 nfr		/**< In: Number of frames in inout_feat. */
321     );
322 
323 /**
324  * Add a subvector specification to the feature module.
325  *
326  * The subvector splitting will be performed after dynamic feature
327  * computation, CMN, AGC, and any LDA transformation.  The number of
328  * streams in the dynamic feature type must be one, as with LDA.
329  *
330  * After adding a subvector specification, the output of feature
331  * computation will be split into multiple subvectors, and
332  * feat_array_alloc() will allocate pointers accordingly.  The number
333  * of <em>streams</em> will remain the
334  *
335  * @param fcb the feature descriptor.
336  * @param subvecs subvector specification.  This pointer is retained
337  * by the feat_t and should not be freed manually.
338  * @return 0 for success or -1 if the subvector specification was
339  * invalid.
340  */
341 SPHINXBASE_EXPORT
342 int feat_set_subvecs(feat_t *fcb, int32 **subvecs);
343 
344 /**
345  * Print the given block of feature vectors to the given FILE.
346  */
347 SPHINXBASE_EXPORT
348 void feat_print(feat_t *fcb,		/**< In: Descriptor from feat_init() */
349 		mfcc_t ***feat,		/**< In: Feature data to be printed */
350 		int32 nfr,		/**< In: Number of frames of feature data above */
351 		FILE *fp		/**< In: Output file pointer */
352     );
353 
354 
355 /**
356  * Read a specified MFC file (or given segment within it), perform
357  * CMN/AGC as indicated by <code>fcb</code>, and compute feature
358  * vectors.  Feature vectors are computed for the entire segment
359  * specified, by including additional surrounding or padding frames to
360  * accommodate the feature windows.
361  *
362  * @return Number of frames of feature vectors computed if successful;
363  * -1 if any error.  <code>If</code> feat is NULL, then no actual
364  * computation will be done, and the number of frames which must be
365  * allocated will be returned.
366  *
367  * A note on how the file path is constructed: If the control file
368  * already specifies extension or absolute path, then these are not
369  * applied. The default extension is defined by the application.
370  */
371 SPHINXBASE_EXPORT
372 int32 feat_s2mfc2feat(feat_t *fcb,	/**< In: Descriptor from feat_init() */
373 		      const char *file,	/**< In: File to be read */
374 		      const char *dir,	/**< In: Directory prefix for file,
375 					   if needed; can be NULL */
376 		      const char *cepext,/**< In: Extension of the
377 					   cepstrum file.It cannot be
378 					   NULL */
379 		      int32 sf, int32 ef,   /* Start/End frames
380                                                within file to be read. Use
381                                                0,-1 to process entire
382                                                file */
383 		      mfcc_t ***feat,	/**< Out: Computed feature vectors;
384 					   caller must allocate this space */
385 		      int32 maxfr	/**< In: Available space (number of frames) in
386 					   above feat array; it must be
387 					   sufficient to hold the result.
388                                            Pass -1 for no limit. */
389     );
390 
391 
392 /**
393  * Feature computation routine for live mode decoder.
394  *
395  * This function computes features for blocks of incoming data. It
396  * retains an internal buffer for computing deltas, which means that
397  * the number of output frames will not necessarily equal the number
398  * of input frames.
399  *
400  * <strong>It is very important</strong> to realize that the number of
401  * output frames can be <strong>greater than</strong> the number of
402  * input frames, specifically when <code>endutt</code> is true.  It is
403  * guaranteed to never exceed <code>*inout_ncep +
404  * feat_window_size(fcb)</code>.  You <strong>MUST</strong> have
405  * allocated at least that many frames in <code>ofeat</code>, or you
406  * will experience a buffer overflow.
407  *
408  * If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will
409  * be done.  Otherwise only CMN_PRIOR and AGC_EMAX will be done.
410  *
411  * If beginutt is false, endutt is true, and the number of input
412  * frames exceeds the input size, then end-of-utterance processing
413  * won't actually be done.  This condition can easily be checked,
414  * because <code>*inout_ncep</code> will equal the return value on
415  * exit, and will also be smaller than the value of
416  * <code>*inout_ncep</code> on entry.
417  *
418  * @return The number of output frames actually computed.
419  **/
420 SPHINXBASE_EXPORT
421 int32 feat_s2mfc2feat_live(feat_t  *fcb,     /**< In: Descriptor from feat_init() */
422                            mfcc_t **uttcep,  /**< In: Incoming cepstral buffer */
423                            int32 *inout_ncep,/**< In: Size of incoming buffer.
424                                                 Out: Number of incoming frames consumed. */
425                            int32 beginutt,   /**< In: Begining of utterance flag */
426                            int32 endutt,     /**< In: End of utterance flag */
427                            mfcc_t ***ofeat   /**< In: Output feature buffer.  See
428                                                 <strong>VERY IMPORTANT</strong> note
429                                                 about the size of this buffer above. */
430     );
431 
432 
433 /**
434  * Update the normalization stats, possibly in the end of utterance
435  *
436  */
437 SPHINXBASE_EXPORT
438 void feat_update_stats(feat_t *fcb);
439 
440 
441 /**
442  * Retain ownership of feat_t.
443  *
444  * @return pointer to retained feat_t.
445  */
446 SPHINXBASE_EXPORT
447 feat_t *feat_retain(feat_t *f);
448 
449 /**
450  * Release resource associated with feat_t
451  *
452  * @return new reference count (0 if freed)
453  */
454 SPHINXBASE_EXPORT
455 int feat_free(feat_t *f /**< In: feat_t */
456     );
457 
458 /**
459  * Report the feat_t data structure
460  */
461 SPHINXBASE_EXPORT
462 void feat_report(feat_t *f /**< In: feat_t */
463     );
464 #ifdef __cplusplus
465 }
466 #endif
467 
468 
469 #endif
470