1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /* 38 * feat.h -- Cepstral features computation. 39 */ 40 41 #ifndef _S3_FEAT_H_ 42 #define _S3_FEAT_H_ 43 44 #include <stdio.h> 45 46 /* Win32/WinCE DLL gunk */ 47 #include <sphinxbase/sphinxbase_export.h> 48 #include <sphinxbase/prim_type.h> 49 #include <sphinxbase/fe.h> 50 #include <sphinxbase/cmn.h> 51 #include <sphinxbase/agc.h> 52 53 #ifdef __cplusplus 54 extern "C" { 55 #endif 56 #if 0 57 /* Fool Emacs. */ 58 } 59 #endif 60 61 /** \file feat.h 62 * \brief compute the dynamic coefficients from the cepstral vector. 63 */ 64 #define LIVEBUFBLOCKSIZE 256 /** Blocks of 256 vectors allocated 65 for livemode decoder */ 66 #define S3_MAX_FRAMES 15000 /* RAH, I believe this is still too large, but better than before */ 67 68 #define cepstral_to_feature_command_line_macro() \ 69 { "-feat", \ 70 ARG_STRING, \ 71 "1s_c_d_dd", \ 72 "Feature stream type, depends on the acoustic model" }, \ 73 { "-ceplen", \ 74 ARG_INT32, \ 75 "13", \ 76 "Number of components in the input feature vector" }, \ 77 { "-cmn", \ 78 ARG_STRING, \ 79 "current", \ 80 "Cepstral mean normalization scheme ('current', 'prior', or 'none')" }, \ 81 { "-cmninit", \ 82 ARG_STRING, \ 83 "8.0", \ 84 "Initial values (comma-separated) for cepstral mean when 'prior' is used" }, \ 85 { "-varnorm", \ 86 ARG_BOOLEAN, \ 87 "no", \ 88 "Variance normalize each utterance (only if CMN == current)" }, \ 89 { "-agc", \ 90 ARG_STRING, \ 91 "none", \ 92 "Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" }, \ 93 { "-agcthresh", \ 94 ARG_FLOAT32, \ 95 "2.0", \ 96 "Initial threshold for automatic gain control" }, \ 97 { "-lda", \ 98 ARG_STRING, \ 99 NULL, \ 100 "File containing transformation matrix to be applied to features (single-stream features only)" }, \ 101 { "-ldadim", \ 102 ARG_INT32, \ 103 "0", \ 104 "Dimensionality of output of feature transformation (0 to use entire matrix)" }, \ 105 {"-svspec", \ 106 ARG_STRING, \ 107 NULL, \ 108 "Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)"} 109 110 /** 111 * \struct feat_t 112 * \brief Structure for describing a speech feature type 113 * Structure for describing a speech feature type (no. of streams and stream widths), 114 * as well as the computation for converting the input speech (e.g., Sphinx-II format 115 * MFC cepstra) into this type of feature vectors. 116 */ 117 typedef struct feat_s { 118 int refcount; /**< Reference count. */ 119 char *name; /**< Printable name for this feature type */ 120 int32 cepsize; /**< Size of input speech vector (typically, a cepstrum vector) */ 121 int32 n_stream; /**< Number of feature streams; e.g., 4 in Sphinx-II */ 122 uint32 *stream_len; /**< Vector length of each feature stream */ 123 int32 window_size; /**< Number of extra frames around given input frame needed to compute 124 corresponding output feature (so total = window_size*2 + 1) */ 125 int32 n_sv; /**< Number of subvectors */ 126 uint32 *sv_len; /**< Vector length of each subvector */ 127 int32 **subvecs; /**< Subvector specification (or NULL for none) */ 128 mfcc_t *sv_buf; /**< Temporary copy buffer for subvector projection */ 129 int32 sv_dim; /**< Total dimensionality of subvector (length of sv_buf) */ 130 131 cmn_type_t cmn; /**< Type of CMN to be performed on each utterance */ 132 int32 varnorm; /**< Whether variance normalization is to be performed on each utt; 133 Irrelevant if no CMN is performed */ 134 agc_type_t agc; /**< Type of AGC to be performed on each utterance */ 135 136 /** 137 * Feature computation function. 138 * @param fcb the feat_t describing this feature type 139 * @param input pointer into the input cepstra 140 * @param feat a 2-d array of output features (n_stream x stream_len) 141 * @return 0 if successful, -ve otherwise. 142 * 143 * Function for converting window of input speech vector 144 * (input[-window_size..window_size]) to output feature vector 145 * (feat[stream][]). If NULL, no conversion available, the 146 * speech input must be feature vector itself. 147 **/ 148 void (*compute_feat)(struct feat_s *fcb, mfcc_t **input, mfcc_t **feat); 149 cmn_t *cmn_struct; /**< Structure that stores the temporary variables for cepstral 150 means normalization*/ 151 agc_t *agc_struct; /**< Structure that stores the temporary variables for acoustic 152 gain control*/ 153 154 mfcc_t **cepbuf; /**< Circular buffer of MFCC frames for live feature computation. */ 155 mfcc_t **tmpcepbuf; /**< Array of pointers into cepbuf to handle border cases. */ 156 int32 bufpos; /**< Write index in cepbuf. */ 157 int32 curpos; /**< Read index in cepbuf. */ 158 159 mfcc_t ***lda; /**< Array of linear transformations (for LDA, MLLT, or whatever) */ 160 uint32 n_lda; /**< Number of linear transformations in lda. */ 161 uint32 out_dim; /**< Output dimensionality */ 162 } feat_t; 163 164 /** 165 * Name of feature type. 166 */ 167 #define feat_name(f) ((f)->name) 168 /** 169 * Input dimensionality of feature. 170 */ 171 #define feat_cepsize(f) ((f)->cepsize) 172 /** 173 * Size of dynamic feature window. 174 */ 175 #define feat_window_size(f) ((f)->window_size) 176 /** 177 * Number of feature streams. 178 * 179 * @deprecated Do not use this, use feat_dimension1() instead. 180 */ 181 #define feat_n_stream(f) ((f)->n_stream) 182 /** 183 * Length of feature stream i. 184 * 185 * @deprecated Do not use this, use feat_dimension2() instead. 186 */ 187 #define feat_stream_len(f,i) ((f)->stream_len[i]) 188 /** 189 * Number of streams or subvectors in feature output. 190 */ 191 #define feat_dimension1(f) ((f)->n_sv ? (f)->n_sv : f->n_stream) 192 /** 193 * Dimensionality of stream/subvector i in feature output. 194 */ 195 #define feat_dimension2(f,i) ((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : f->stream_len[i])) 196 /** 197 * Total dimensionality of feature output. 198 */ 199 #define feat_dimension(f) ((f)->out_dim) 200 /** 201 * Array with stream/subvector lengths 202 */ 203 #define feat_stream_lengths(f) ((f)->lda ? (&(f)->out_dim) : (f)->sv_len ? (f)->sv_len : f->stream_len) 204 205 /** 206 * Parse subvector specification string. 207 * 208 * Format of specification: 209 * \li '/' separated list of subvectors 210 * \li each subvector is a ',' separated list of subranges 211 * \li each subrange is a single \verbatim <number> \endverbatim or 212 * \verbatim <number>-<number> \endverbatim (inclusive), where 213 * \verbatim <number> \endverbatim is a feature vector dimension 214 * specifier. 215 * 216 * E.g., "24,0-11/25,12-23/26,27-38" has: 217 * \li 3 subvectors 218 * \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11. 219 * \li etc. 220 * 221 * @param str subvector specification string. 222 * @return allocated 2-D array of subvector specs (free with 223 * subvecs_free()). If there are N subvectors specified, subvec[N] = 224 * NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of 225 * feature dims. 226 */ 227 SPHINXBASE_EXPORT 228 int32 **parse_subvecs(char const *str); 229 230 /** 231 * Free array of subvector specs. 232 */ 233 SPHINXBASE_EXPORT 234 void subvecs_free(int32 **subvecs); 235 236 237 /** 238 * Allocate an array to hold several frames worth of feature vectors. The returned value 239 * is the mfcc_t ***data array, organized as follows: 240 * 241 * - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ... 242 * - data[1][0] = frame 1 stream 0 vector, data[0][1] = frame 1 stream 1 vector, ... 243 * - data[2][0] = frame 2 stream 0 vector, data[0][1] = frame 2 stream 1 vector, ... 244 * - ... 245 * 246 * NOTE: For I/O convenience, the entire data area is allocated as one contiguous block. 247 * @return pointer to the allocated space if successful, NULL if any error. 248 */ 249 SPHINXBASE_EXPORT 250 mfcc_t ***feat_array_alloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used 251 to obtain number of streams and stream sizes */ 252 int32 nfr /**< In: Number of frames for which to allocate */ 253 ); 254 255 /** 256 * Realloate the array of features. Requires us to know the old size 257 */ 258 SPHINXBASE_EXPORT 259 mfcc_t ***feat_array_realloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used 260 to obtain number of streams and stream sizes */ 261 mfcc_t ***old_feat, /**< Feature array. Freed */ 262 int32 ofr, /**< In: Previous number of frames */ 263 int32 nfr /**< In: Number of frames for which to allocate */ 264 ); 265 266 /** 267 * Free a buffer allocated with feat_array_alloc() 268 */ 269 SPHINXBASE_EXPORT 270 void feat_array_free(mfcc_t ***feat); 271 272 273 /** 274 * Initialize feature module to use the selected type of feature stream. 275 * One-time only initialization at the beginning of the program. Input type 276 * is a string defining the kind of input->feature conversion desired: 277 * 278 * - "s2_4x": s2mfc->Sphinx-II 4-feature stream, 279 * - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream, 280 * - "s3_1x39": s2mfc->Sphinx 3.0 single feature stream, 281 * - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated 282 * feature stream lengths. In this case, the input data is already in the 283 * feature format and there is no conversion necessary. 284 * 285 * @return (feat_t *) descriptor if successful, NULL if error. Caller 286 * must not directly modify the contents of the returned value. 287 */ 288 SPHINXBASE_EXPORT 289 feat_t *feat_init(char const *type,/**< In: Type of feature stream */ 290 cmn_type_t cmn, /**< In: Type of cepstram mean normalization to 291 be done before feature computation; can be 292 CMN_NONE (for none) */ 293 int32 varnorm, /**< In: (boolean) Whether variance 294 normalization done on each utt; only 295 applicable if CMN also done */ 296 agc_type_t agc, /**< In: Type of automatic gain control to be 297 done before feature computation */ 298 int32 breport, /**< In: Whether to show a report for feat_t */ 299 int32 cepsize /**< Number of components in the input vector 300 (or 0 for the default for this feature type, 301 which is usually 13) */ 302 ); 303 304 /** 305 * Add an LDA transformation to the feature module from a file. 306 * @return 0 for success or -1 if reading the LDA file failed. 307 **/ 308 SPHINXBASE_EXPORT 309 int32 feat_read_lda(feat_t *feat, /**< In: Descriptor from feat_init() */ 310 const char *ldafile, /**< In: File to read the LDA matrix from. */ 311 int32 dim /**< In: Dimensionality of LDA output. */ 312 ); 313 314 /** 315 * Transform a block of features using the feature module's LDA transform. 316 **/ 317 SPHINXBASE_EXPORT 318 void feat_lda_transform(feat_t *fcb, /**< In: Descriptor from feat_init() */ 319 mfcc_t ***inout_feat, /**< Feature block to transform. */ 320 uint32 nfr /**< In: Number of frames in inout_feat. */ 321 ); 322 323 /** 324 * Add a subvector specification to the feature module. 325 * 326 * The subvector splitting will be performed after dynamic feature 327 * computation, CMN, AGC, and any LDA transformation. The number of 328 * streams in the dynamic feature type must be one, as with LDA. 329 * 330 * After adding a subvector specification, the output of feature 331 * computation will be split into multiple subvectors, and 332 * feat_array_alloc() will allocate pointers accordingly. The number 333 * of <em>streams</em> will remain the 334 * 335 * @param fcb the feature descriptor. 336 * @param subvecs subvector specification. This pointer is retained 337 * by the feat_t and should not be freed manually. 338 * @return 0 for success or -1 if the subvector specification was 339 * invalid. 340 */ 341 SPHINXBASE_EXPORT 342 int feat_set_subvecs(feat_t *fcb, int32 **subvecs); 343 344 /** 345 * Print the given block of feature vectors to the given FILE. 346 */ 347 SPHINXBASE_EXPORT 348 void feat_print(feat_t *fcb, /**< In: Descriptor from feat_init() */ 349 mfcc_t ***feat, /**< In: Feature data to be printed */ 350 int32 nfr, /**< In: Number of frames of feature data above */ 351 FILE *fp /**< In: Output file pointer */ 352 ); 353 354 355 /** 356 * Read a specified MFC file (or given segment within it), perform 357 * CMN/AGC as indicated by <code>fcb</code>, and compute feature 358 * vectors. Feature vectors are computed for the entire segment 359 * specified, by including additional surrounding or padding frames to 360 * accommodate the feature windows. 361 * 362 * @return Number of frames of feature vectors computed if successful; 363 * -1 if any error. <code>If</code> feat is NULL, then no actual 364 * computation will be done, and the number of frames which must be 365 * allocated will be returned. 366 * 367 * A note on how the file path is constructed: If the control file 368 * already specifies extension or absolute path, then these are not 369 * applied. The default extension is defined by the application. 370 */ 371 SPHINXBASE_EXPORT 372 int32 feat_s2mfc2feat(feat_t *fcb, /**< In: Descriptor from feat_init() */ 373 const char *file, /**< In: File to be read */ 374 const char *dir, /**< In: Directory prefix for file, 375 if needed; can be NULL */ 376 const char *cepext,/**< In: Extension of the 377 cepstrum file.It cannot be 378 NULL */ 379 int32 sf, int32 ef, /* Start/End frames 380 within file to be read. Use 381 0,-1 to process entire 382 file */ 383 mfcc_t ***feat, /**< Out: Computed feature vectors; 384 caller must allocate this space */ 385 int32 maxfr /**< In: Available space (number of frames) in 386 above feat array; it must be 387 sufficient to hold the result. 388 Pass -1 for no limit. */ 389 ); 390 391 392 /** 393 * Feature computation routine for live mode decoder. 394 * 395 * This function computes features for blocks of incoming data. It 396 * retains an internal buffer for computing deltas, which means that 397 * the number of output frames will not necessarily equal the number 398 * of input frames. 399 * 400 * <strong>It is very important</strong> to realize that the number of 401 * output frames can be <strong>greater than</strong> the number of 402 * input frames, specifically when <code>endutt</code> is true. It is 403 * guaranteed to never exceed <code>*inout_ncep + 404 * feat_window_size(fcb)</code>. You <strong>MUST</strong> have 405 * allocated at least that many frames in <code>ofeat</code>, or you 406 * will experience a buffer overflow. 407 * 408 * If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will 409 * be done. Otherwise only CMN_PRIOR and AGC_EMAX will be done. 410 * 411 * If beginutt is false, endutt is true, and the number of input 412 * frames exceeds the input size, then end-of-utterance processing 413 * won't actually be done. This condition can easily be checked, 414 * because <code>*inout_ncep</code> will equal the return value on 415 * exit, and will also be smaller than the value of 416 * <code>*inout_ncep</code> on entry. 417 * 418 * @return The number of output frames actually computed. 419 **/ 420 SPHINXBASE_EXPORT 421 int32 feat_s2mfc2feat_live(feat_t *fcb, /**< In: Descriptor from feat_init() */ 422 mfcc_t **uttcep, /**< In: Incoming cepstral buffer */ 423 int32 *inout_ncep,/**< In: Size of incoming buffer. 424 Out: Number of incoming frames consumed. */ 425 int32 beginutt, /**< In: Begining of utterance flag */ 426 int32 endutt, /**< In: End of utterance flag */ 427 mfcc_t ***ofeat /**< In: Output feature buffer. See 428 <strong>VERY IMPORTANT</strong> note 429 about the size of this buffer above. */ 430 ); 431 432 433 /** 434 * Update the normalization stats, possibly in the end of utterance 435 * 436 */ 437 SPHINXBASE_EXPORT 438 void feat_update_stats(feat_t *fcb); 439 440 441 /** 442 * Retain ownership of feat_t. 443 * 444 * @return pointer to retained feat_t. 445 */ 446 SPHINXBASE_EXPORT 447 feat_t *feat_retain(feat_t *f); 448 449 /** 450 * Release resource associated with feat_t 451 * 452 * @return new reference count (0 if freed) 453 */ 454 SPHINXBASE_EXPORT 455 int feat_free(feat_t *f /**< In: feat_t */ 456 ); 457 458 /** 459 * Report the feat_t data structure 460 */ 461 SPHINXBASE_EXPORT 462 void feat_report(feat_t *f /**< In: feat_t */ 463 ); 464 #ifdef __cplusplus 465 } 466 #endif 467 468 469 #endif 470