1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /* 38 * feat.h -- Cepstral features computation. 39 * 40 * ********************************************** 41 * CMU ARPA Speech Project 42 * 43 * Copyright (c) 1999 Carnegie Mellon University. 44 * ALL RIGHTS RESERVED. 45 * ********************************************** 46 * 47 * HISTORY 48 * $Log$ 49 * Revision 1.1 2006/04/05 20:27:30 dhdfu 50 * A Great Reorganzation of header files and executables 51 * 52 * Revision 1.17 2006/02/23 03:59:40 arthchan2003 53 * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH: a, Free buffers correctly. b, Fixed dox-doc. 54 * 55 * Revision 1.16.4.1 2005/07/05 06:25:08 arthchan2003 56 * Fixed dox-doc. 57 * 58 * Revision 1.16 2005/06/22 03:29:35 arthchan2003 59 * Makefile.am s for all subdirectory of libs3decoder/ 60 * 61 * Revision 1.5 2005/06/13 04:02:56 archan 62 * Fixed most doxygen-style documentation under libs3decoder. 63 * 64 * Revision 1.4 2005/04/21 23:50:26 archan 65 * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in. At this moment, everything in search mode 5 is already done. It is time to test the idea whether the search can really be used. 66 * 67 * Revision 1.3 2005/03/30 01:22:46 archan 68 * Fixed mistakes in last updates. Add 69 * 70 * 71 * 20.Apr.2001 RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu) 72 * Adding feat_free() to free allocated memory 73 * 74 * 04-Jan-1999 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 75 * Started. 76 */ 77 78 79 #ifndef _S3_FEAT_H_ 80 #define _S3_FEAT_H_ 81 82 #include <stdio.h> 83 84 /* Win32/WinCE DLL gunk */ 85 #include <sphinxbase/sphinxbase_export.h> 86 #include <sphinxbase/prim_type.h> 87 #include <sphinxbase/fe.h> 88 #include <sphinxbase/cmn.h> 89 #include <sphinxbase/agc.h> 90 91 #ifdef __cplusplus 92 extern "C" { 93 #endif 94 #if 0 95 /* Fool Emacs. */ 96 } 97 #endif 98 99 /** \file feat.h 100 * \brief compute the dynamic coefficients from the cepstral vector. 101 */ 102 #define LIVEBUFBLOCKSIZE 256 /** Blocks of 256 vectors allocated 103 for livemode decoder */ 104 #define S3_MAX_FRAMES 15000 /* RAH, I believe this is still too large, but better than before */ 105 106 #define cepstral_to_feature_command_line_macro() \ 107 { "-feat", \ 108 ARG_STRING, \ 109 "1s_c_d_dd", \ 110 "Feature stream type, depends on the acoustic model" }, \ 111 { "-ceplen", \ 112 ARG_INT32, \ 113 "13", \ 114 "Number of components in the input feature vector" }, \ 115 { "-cmn", \ 116 ARG_STRING, \ 117 "current", \ 118 "Cepstral mean normalization scheme ('current', 'prior', or 'none')" }, \ 119 { "-cmninit", \ 120 ARG_STRING, \ 121 "8.0", \ 122 "Initial values (comma-separated) for cepstral mean when 'prior' is used" }, \ 123 { "-varnorm", \ 124 ARG_BOOLEAN, \ 125 "no", \ 126 "Variance normalize each utterance (only if CMN == current)" }, \ 127 { "-agc", \ 128 ARG_STRING, \ 129 "none", \ 130 "Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" }, \ 131 { "-agcthresh", \ 132 ARG_FLOAT32, \ 133 "2.0", \ 134 "Initial threshold for automatic gain control" }, \ 135 { "-lda", \ 136 ARG_STRING, \ 137 NULL, \ 138 "File containing transformation matrix to be applied to features (single-stream features only)" }, \ 139 { "-ldadim", \ 140 ARG_INT32, \ 141 "0", \ 142 "Dimensionality of output of feature transformation (0 to use entire matrix)" }, \ 143 {"-svspec", \ 144 ARG_STRING, \ 145 NULL, \ 146 "Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)"} 147 148 /** 149 * \struct feat_t 150 * \brief Structure for describing a speech feature type 151 * Structure for describing a speech feature type (no. of streams and stream widths), 152 * as well as the computation for converting the input speech (e.g., Sphinx-II format 153 * MFC cepstra) into this type of feature vectors. 154 */ 155 typedef struct feat_s { 156 int refcount; /**< Reference count. */ 157 char *name; /**< Printable name for this feature type */ 158 int32 cepsize; /**< Size of input speech vector (typically, a cepstrum vector) */ 159 int32 n_stream; /**< Number of feature streams; e.g., 4 in Sphinx-II */ 160 uint32 *stream_len; /**< Vector length of each feature stream */ 161 int32 window_size; /**< Number of extra frames around given input frame needed to compute 162 corresponding output feature (so total = window_size*2 + 1) */ 163 int32 n_sv; /**< Number of subvectors */ 164 uint32 *sv_len; /**< Vector length of each subvector */ 165 int32 **subvecs; /**< Subvector specification (or NULL for none) */ 166 mfcc_t *sv_buf; /**< Temporary copy buffer for subvector projection */ 167 int32 sv_dim; /**< Total dimensionality of subvector (length of sv_buf) */ 168 169 cmn_type_t cmn; /**< Type of CMN to be performed on each utterance */ 170 int32 varnorm; /**< Whether variance normalization is to be performed on each utt; 171 Irrelevant if no CMN is performed */ 172 agc_type_t agc; /**< Type of AGC to be performed on each utterance */ 173 174 /** 175 * Feature computation function. 176 * @param fcb the feat_t describing this feature type 177 * @param input pointer into the input cepstra 178 * @param feat a 2-d array of output features (n_stream x stream_len) 179 * @return 0 if successful, -ve otherwise. 180 * 181 * Function for converting window of input speech vector 182 * (input[-window_size..window_size]) to output feature vector 183 * (feat[stream][]). If NULL, no conversion available, the 184 * speech input must be feature vector itself. 185 **/ 186 void (*compute_feat)(struct feat_s *fcb, mfcc_t **input, mfcc_t **feat); 187 cmn_t *cmn_struct; /**< Structure that stores the temporary variables for cepstral 188 means normalization*/ 189 agc_t *agc_struct; /**< Structure that stores the temporary variables for acoustic 190 gain control*/ 191 192 mfcc_t **cepbuf; /**< Circular buffer of MFCC frames for live feature computation. */ 193 mfcc_t **tmpcepbuf; /**< Array of pointers into cepbuf to handle border cases. */ 194 int32 bufpos; /**< Write index in cepbuf. */ 195 int32 curpos; /**< Read index in cepbuf. */ 196 197 mfcc_t ***lda; /**< Array of linear transformations (for LDA, MLLT, or whatever) */ 198 uint32 n_lda; /**< Number of linear transformations in lda. */ 199 uint32 out_dim; /**< Output dimensionality */ 200 } feat_t; 201 202 /** 203 * Name of feature type. 204 */ 205 #define feat_name(f) ((f)->name) 206 /** 207 * Input dimensionality of feature. 208 */ 209 #define feat_cepsize(f) ((f)->cepsize) 210 /** 211 * Size of dynamic feature window. 212 */ 213 #define feat_window_size(f) ((f)->window_size) 214 /** 215 * Number of feature streams. 216 * 217 * @deprecated Do not use this, use feat_dimension1() instead. 218 */ 219 #define feat_n_stream(f) ((f)->n_stream) 220 /** 221 * Length of feature stream i. 222 * 223 * @deprecated Do not use this, use feat_dimension2() instead. 224 */ 225 #define feat_stream_len(f,i) ((f)->stream_len[i]) 226 /** 227 * Number of streams or subvectors in feature output. 228 */ 229 #define feat_dimension1(f) ((f)->n_sv ? (f)->n_sv : f->n_stream) 230 /** 231 * Dimensionality of stream/subvector i in feature output. 232 */ 233 #define feat_dimension2(f,i) ((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : f->stream_len[i])) 234 /** 235 * Total dimensionality of feature output. 236 */ 237 #define feat_dimension(f) ((f)->out_dim) 238 /** 239 * Array with stream/subvector lengths 240 */ 241 #define feat_stream_lengths(f) ((f)->lda ? (&(f)->out_dim) : (f)->sv_len ? (f)->sv_len : f->stream_len) 242 243 /** 244 * Parse subvector specification string. 245 * 246 * Format of specification: 247 * \li '/' separated list of subvectors 248 * \li each subvector is a ',' separated list of subranges 249 * \li each subrange is a single \verbatim <number> \endverbatim or 250 * \verbatim <number>-<number> \endverbatim (inclusive), where 251 * \verbatim <number> \endverbatim is a feature vector dimension 252 * specifier. 253 * 254 * E.g., "24,0-11/25,12-23/26,27-38" has: 255 * \li 3 subvectors 256 * \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11. 257 * \li etc. 258 * 259 * @param str subvector specification string. 260 * @return allocated 2-D array of subvector specs (free with 261 * subvecs_free()). If there are N subvectors specified, subvec[N] = 262 * NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of 263 * feature dims. 264 */ 265 SPHINXBASE_EXPORT 266 int32 **parse_subvecs(char const *str); 267 268 /** 269 * Free array of subvector specs. 270 */ 271 SPHINXBASE_EXPORT 272 void subvecs_free(int32 **subvecs); 273 274 275 /** 276 * Allocate an array to hold several frames worth of feature vectors. The returned value 277 * is the mfcc_t ***data array, organized as follows: 278 * 279 * - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ... 280 * - data[1][0] = frame 1 stream 0 vector, data[0][1] = frame 1 stream 1 vector, ... 281 * - data[2][0] = frame 2 stream 0 vector, data[0][1] = frame 2 stream 1 vector, ... 282 * - ... 283 * 284 * NOTE: For I/O convenience, the entire data area is allocated as one contiguous block. 285 * @return pointer to the allocated space if successful, NULL if any error. 286 */ 287 SPHINXBASE_EXPORT 288 mfcc_t ***feat_array_alloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used 289 to obtain number of streams and stream sizes */ 290 int32 nfr /**< In: Number of frames for which to allocate */ 291 ); 292 293 /** 294 * Realloate the array of features. Requires us to know the old size 295 */ 296 SPHINXBASE_EXPORT 297 mfcc_t ***feat_array_realloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used 298 to obtain number of streams and stream sizes */ 299 mfcc_t ***old_feat, /**< Feature array. Freed */ 300 int32 ofr, /**< In: Previous number of frames */ 301 int32 nfr /**< In: Number of frames for which to allocate */ 302 ); 303 304 /** 305 * Free a buffer allocated with feat_array_alloc() 306 */ 307 SPHINXBASE_EXPORT 308 void feat_array_free(mfcc_t ***feat); 309 310 311 /** 312 * Initialize feature module to use the selected type of feature stream. 313 * One-time only initialization at the beginning of the program. Input type 314 * is a string defining the kind of input->feature conversion desired: 315 * 316 * - "s2_4x": s2mfc->Sphinx-II 4-feature stream, 317 * - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream, 318 * - "s3_1x39": s2mfc->Sphinx 3.0 single feature stream, 319 * - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated 320 * feature stream lengths. In this case, the input data is already in the 321 * feature format and there is no conversion necessary. 322 * 323 * @return (feat_t *) descriptor if successful, NULL if error. Caller 324 * must not directly modify the contents of the returned value. 325 */ 326 SPHINXBASE_EXPORT 327 feat_t *feat_init(char const *type,/**< In: Type of feature stream */ 328 cmn_type_t cmn, /**< In: Type of cepstram mean normalization to 329 be done before feature computation; can be 330 CMN_NONE (for none) */ 331 int32 varnorm, /**< In: (boolean) Whether variance 332 normalization done on each utt; only 333 applicable if CMN also done */ 334 agc_type_t agc, /**< In: Type of automatic gain control to be 335 done before feature computation */ 336 int32 breport, /**< In: Whether to show a report for feat_t */ 337 int32 cepsize /**< Number of components in the input vector 338 (or 0 for the default for this feature type, 339 which is usually 13) */ 340 ); 341 342 /** 343 * Add an LDA transformation to the feature module from a file. 344 * @return 0 for success or -1 if reading the LDA file failed. 345 **/ 346 SPHINXBASE_EXPORT 347 int32 feat_read_lda(feat_t *feat, /**< In: Descriptor from feat_init() */ 348 const char *ldafile, /**< In: File to read the LDA matrix from. */ 349 int32 dim /**< In: Dimensionality of LDA output. */ 350 ); 351 352 /** 353 * Transform a block of features using the feature module's LDA transform. 354 **/ 355 SPHINXBASE_EXPORT 356 void feat_lda_transform(feat_t *fcb, /**< In: Descriptor from feat_init() */ 357 mfcc_t ***inout_feat, /**< Feature block to transform. */ 358 uint32 nfr /**< In: Number of frames in inout_feat. */ 359 ); 360 361 /** 362 * Add a subvector specification to the feature module. 363 * 364 * The subvector splitting will be performed after dynamic feature 365 * computation, CMN, AGC, and any LDA transformation. The number of 366 * streams in the dynamic feature type must be one, as with LDA. 367 * 368 * After adding a subvector specification, the output of feature 369 * computation will be split into multiple subvectors, and 370 * feat_array_alloc() will allocate pointers accordingly. The number 371 * of <em>streams</em> will remain the 372 * 373 * @param fcb the feature descriptor. 374 * @param subvecs subvector specification. This pointer is retained 375 * by the feat_t and should not be freed manually. 376 * @return 0 for success or -1 if the subvector specification was 377 * invalid. 378 */ 379 SPHINXBASE_EXPORT 380 int feat_set_subvecs(feat_t *fcb, int32 **subvecs); 381 382 /** 383 * Print the given block of feature vectors to the given FILE. 384 */ 385 SPHINXBASE_EXPORT 386 void feat_print(feat_t *fcb, /**< In: Descriptor from feat_init() */ 387 mfcc_t ***feat, /**< In: Feature data to be printed */ 388 int32 nfr, /**< In: Number of frames of feature data above */ 389 FILE *fp /**< In: Output file pointer */ 390 ); 391 392 393 /** 394 * Read a specified MFC file (or given segment within it), perform 395 * CMN/AGC as indicated by <code>fcb</code>, and compute feature 396 * vectors. Feature vectors are computed for the entire segment 397 * specified, by including additional surrounding or padding frames to 398 * accommodate the feature windows. 399 * 400 * @return Number of frames of feature vectors computed if successful; 401 * -1 if any error. <code>If</code> feat is NULL, then no actual 402 * computation will be done, and the number of frames which must be 403 * allocated will be returned. 404 * 405 * A note on how the file path is constructed: If the control file 406 * already specifies extension or absolute path, then these are not 407 * applied. The default extension is defined by the application. 408 */ 409 SPHINXBASE_EXPORT 410 int32 feat_s2mfc2feat(feat_t *fcb, /**< In: Descriptor from feat_init() */ 411 const char *file, /**< In: File to be read */ 412 const char *dir, /**< In: Directory prefix for file, 413 if needed; can be NULL */ 414 const char *cepext,/**< In: Extension of the 415 cepstrum file.It cannot be 416 NULL */ 417 int32 sf, int32 ef, /* Start/End frames 418 within file to be read. Use 419 0,-1 to process entire 420 file */ 421 mfcc_t ***feat, /**< Out: Computed feature vectors; 422 caller must allocate this space */ 423 int32 maxfr /**< In: Available space (number of frames) in 424 above feat array; it must be 425 sufficient to hold the result. 426 Pass -1 for no limit. */ 427 ); 428 429 430 /** 431 * Feature computation routine for live mode decoder. 432 * 433 * This function computes features for blocks of incoming data. It 434 * retains an internal buffer for computing deltas, which means that 435 * the number of output frames will not necessarily equal the number 436 * of input frames. 437 * 438 * <strong>It is very important</strong> to realize that the number of 439 * output frames can be <strong>greater than</strong> the number of 440 * input frames, specifically when <code>endutt</code> is true. It is 441 * guaranteed to never exceed <code>*inout_ncep + 442 * feat_window_size(fcb)</code>. You <strong>MUST</strong> have 443 * allocated at least that many frames in <code>ofeat</code>, or you 444 * will experience a buffer overflow. 445 * 446 * If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will 447 * be done. Otherwise only CMN_PRIOR and AGC_EMAX will be done. 448 * 449 * If beginutt is false, endutt is true, and the number of input 450 * frames exceeds the input size, then end-of-utterance processing 451 * won't actually be done. This condition can easily be checked, 452 * because <code>*inout_ncep</code> will equal the return value on 453 * exit, and will also be smaller than the value of 454 * <code>*inout_ncep</code> on entry. 455 * 456 * @return The number of output frames actually computed. 457 **/ 458 SPHINXBASE_EXPORT 459 int32 feat_s2mfc2feat_live(feat_t *fcb, /**< In: Descriptor from feat_init() */ 460 mfcc_t **uttcep, /**< In: Incoming cepstral buffer */ 461 int32 *inout_ncep,/**< In: Size of incoming buffer. 462 Out: Number of incoming frames consumed. */ 463 int32 beginutt, /**< In: Begining of utterance flag */ 464 int32 endutt, /**< In: End of utterance flag */ 465 mfcc_t ***ofeat /**< In: Output feature buffer. See 466 <strong>VERY IMPORTANT</strong> note 467 about the size of this buffer above. */ 468 ); 469 470 471 /** 472 * Retain ownership of feat_t. 473 * 474 * @return pointer to retained feat_t. 475 */ 476 SPHINXBASE_EXPORT 477 feat_t *feat_retain(feat_t *f); 478 479 /** 480 * Release resource associated with feat_t 481 * 482 * @return new reference count (0 if freed) 483 */ 484 SPHINXBASE_EXPORT 485 int feat_free(feat_t *f /**< In: feat_t */ 486 ); 487 488 /** 489 * Report the feat_t data structure 490 */ 491 SPHINXBASE_EXPORT 492 void feat_report(feat_t *f /**< In: feat_t */ 493 ); 494 #ifdef __cplusplus 495 } 496 #endif 497 498 499 #endif 500