1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 2008 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 38 /** 39 * @file acmod.h Acoustic model structures for PocketSphinx. 40 * @author David Huggins-Daines <dhuggins@cs.cmu.edu> 41 */ 42 43 #ifndef __ACMOD_H__ 44 #define __ACMOD_H__ 45 46 /* System headers. */ 47 #include <stdio.h> 48 49 /* SphinxBase headers. */ 50 #include <sphinxbase/cmd_ln.h> 51 #include <sphinxbase/logmath.h> 52 #include <sphinxbase/fe.h> 53 #include <sphinxbase/feat.h> 54 #include <sphinxbase/bitvec.h> 55 #include <sphinxbase/err.h> 56 #include <sphinxbase/prim_type.h> 57 58 /* Local headers. */ 59 #include "ps_mllr.h" 60 #include "bin_mdef.h" 61 #include "tmat.h" 62 #include "hmm.h" 63 64 /** 65 * States in utterance processing. 66 */ 67 typedef enum acmod_state_e { 68 ACMOD_IDLE, /**< Not in an utterance. */ 69 ACMOD_STARTED, /**< Utterance started, no data yet. */ 70 ACMOD_PROCESSING, /**< Utterance in progress. */ 71 ACMOD_ENDED /**< Utterance ended, still buffering. */ 72 } acmod_state_t; 73 74 /** 75 * Dummy senone score value for unintentionally active states. 76 */ 77 #define SENSCR_DUMMY 0x7fff 78 79 /** 80 * Feature space linear transform structure. 81 */ 82 struct ps_mllr_s { 83 int refcnt; /**< Reference count. */ 84 int n_class; /**< Number of MLLR classes. */ 85 int n_feat; /**< Number of feature streams. */ 86 int *veclen; /**< Length of input vectors for each stream. */ 87 float32 ****A; /**< Rotation part of mean transformations. */ 88 float32 ***b; /**< Bias part of mean transformations. */ 89 float32 ***h; /**< Diagonal transformation of variances. */ 90 int32 *cb2mllr; /**< Mapping from codebooks to transformations. */ 91 }; 92 93 /** 94 * Acoustic model parameter structure. 95 */ 96 typedef struct ps_mgau_s ps_mgau_t; 97 98 typedef struct ps_mgaufuncs_s { 99 char const *name; 100 101 int (*frame_eval)(ps_mgau_t *mgau, 102 int16 *senscr, 103 uint8 *senone_active, 104 int32 n_senone_active, 105 mfcc_t ** feat, 106 int32 frame, 107 int32 compallsen); 108 int (*transform)(ps_mgau_t *mgau, 109 ps_mllr_t *mllr); 110 void (*free)(ps_mgau_t *mgau); 111 } ps_mgaufuncs_t; 112 113 struct ps_mgau_s { 114 ps_mgaufuncs_t *vt; /**< vtable of mgau functions. */ 115 int frame_idx; /**< frame counter. */ 116 }; 117 118 #define ps_mgau_base(mg) ((ps_mgau_t *)(mg)) 119 #define ps_mgau_frame_eval(mg,senscr,senone_active,n_senone_active,feat,frame,compallsen) \ 120 (*ps_mgau_base(mg)->vt->frame_eval) \ 121 (mg, senscr, senone_active, n_senone_active, feat, frame, compallsen) 122 #define ps_mgau_transform(mg, mllr) \ 123 (*ps_mgau_base(mg)->vt->transform)(mg, mllr) 124 #define ps_mgau_free(mg) \ 125 (*ps_mgau_base(mg)->vt->free)(mg) 126 127 /** 128 * Acoustic model structure. 129 * 130 * This object encapsulates all stages of acoustic processing, from 131 * raw audio input to acoustic score output. The reason for grouping 132 * all of these modules together is that they all have to "agree" in 133 * their parameterizations, and the configuration of the acoustic and 134 * dynamic feature computation is completely dependent on the 135 * parameters used to build the original acoustic model (which should 136 * by now always be specified in a feat.params file). 137 * 138 * Because there is not a one-to-one correspondence from blocks of 139 * input audio or frames of input features to frames of acoustic 140 * scores (due to dynamic feature calculation), results may not be 141 * immediately available after input, and the output results will not 142 * correspond to the last piece of data input. 143 * 144 * TODO: In addition, this structure serves the purpose of queueing 145 * frames of features (and potentially also scores in the future) for 146 * asynchronous passes of recognition operating in parallel. 147 */ 148 struct acmod_s { 149 /* Global objects, not retained. */ 150 cmd_ln_t *config; /**< Configuration. */ 151 logmath_t *lmath; /**< Log-math computation. */ 152 glist_t strings; /**< Temporary acoustic model filenames. */ 153 154 /* Feature computation: */ 155 fe_t *fe; /**< Acoustic feature computation. */ 156 feat_t *fcb; /**< Dynamic feature computation. */ 157 158 /* Model parameters: */ 159 bin_mdef_t *mdef; /**< Model definition. */ 160 tmat_t *tmat; /**< Transition matrices. */ 161 ps_mgau_t *mgau; /**< Model parameters. */ 162 ps_mllr_t *mllr; /**< Speaker transformation. */ 163 164 /* Senone scoring: */ 165 int16 *senone_scores; /**< GMM scores for current frame. */ 166 bitvec_t *senone_active_vec; /**< Active GMMs in current frame. */ 167 uint8 *senone_active; /**< Array of deltas to active GMMs. */ 168 int senscr_frame; /**< Frame index for senone_scores. */ 169 int n_senone_active; /**< Number of active GMMs. */ 170 int log_zero; /**< Zero log-probability value. */ 171 172 /* Utterance processing: */ 173 mfcc_t **mfc_buf; /**< Temporary buffer of acoustic features. */ 174 mfcc_t ***feat_buf; /**< Temporary buffer of dynamic features. */ 175 FILE *rawfh; /**< File for writing raw audio data. */ 176 FILE *mfcfh; /**< File for writing acoustic feature data. */ 177 FILE *senfh; /**< File for writing senone score data. */ 178 FILE *insenfh; /**< Input senone score file. */ 179 long *framepos; /**< File positions of recent frames in senone file. */ 180 181 /* A whole bunch of flags and counters: */ 182 uint8 state; /**< State of utterance processing. */ 183 uint8 compallsen; /**< Compute all senones? */ 184 uint8 grow_feat; /**< Whether to grow feat_buf. */ 185 uint8 insen_swap; /**< Whether to swap input senone score. */ 186 187 frame_idx_t output_frame; /**< Index of next frame of dynamic features. */ 188 frame_idx_t n_mfc_alloc; /**< Number of frames allocated in mfc_buf */ 189 frame_idx_t n_mfc_frame; /**< Number of frames active in mfc_buf */ 190 frame_idx_t mfc_outidx; /**< Start of active frames in mfc_buf */ 191 frame_idx_t n_feat_alloc; /**< Number of frames allocated in feat_buf */ 192 frame_idx_t n_feat_frame; /**< Number of frames active in feat_buf */ 193 frame_idx_t feat_outidx; /**< Start of active frames in feat_buf */ 194 }; 195 typedef struct acmod_s acmod_t; 196 197 /** 198 * Initialize an acoustic model. 199 * 200 * @param config a command-line object containing parameters. This 201 * pointer is not retained by this object. 202 * @param lmath global log-math parameters. 203 * @param fe a previously-initialized acoustic feature module to use, 204 * or NULL to create one automatically. If this is supplied 205 * and its parameters do not match those in the acoustic 206 * model, this function will fail. This pointer is not retained. 207 * @param fe a previously-initialized dynamic feature module to use, 208 * or NULL to create one automatically. If this is supplied 209 * and its parameters do not match those in the acoustic 210 * model, this function will fail. This pointer is not retained. 211 * @return a newly initialized acmod_t, or NULL on failure. 212 */ 213 acmod_t *acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb); 214 215 /** 216 * Adapt acoustic model using a linear transform. 217 * 218 * @param mllr The new transform to use, or NULL to update the existing 219 * transform. The decoder retains ownership of this pointer, 220 * so you should not attempt to free it manually. Use 221 * ps_mllr_retain() if you wish to reuse it 222 * elsewhere. 223 * @return The updated transform object for this decoder, or 224 * NULL on failure. 225 */ 226 ps_mllr_t *acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr); 227 228 /** 229 * Start logging senone scores to a filehandle. 230 * 231 * @param acmod Acoustic model object. 232 * @param logfh Filehandle to log to. 233 * @return 0 for success, <0 on error. 234 */ 235 int acmod_set_senfh(acmod_t *acmod, FILE *senfh); 236 237 /** 238 * Start logging MFCCs to a filehandle. 239 * 240 * @param acmod Acoustic model object. 241 * @param logfh Filehandle to log to. 242 * @return 0 for success, <0 on error. 243 */ 244 int acmod_set_mfcfh(acmod_t *acmod, FILE *logfh); 245 246 /** 247 * Start logging raw audio to a filehandle. 248 * 249 * @param acmod Acoustic model object. 250 * @param logfh Filehandle to log to. 251 * @return 0 for success, <0 on error. 252 */ 253 int acmod_set_rawfh(acmod_t *acmod, FILE *logfh); 254 255 /** 256 * Finalize an acoustic model. 257 */ 258 void acmod_free(acmod_t *acmod); 259 260 /** 261 * Mark the start of an utterance. 262 */ 263 int acmod_start_utt(acmod_t *acmod); 264 265 /** 266 * Mark the end of an utterance. 267 */ 268 int acmod_end_utt(acmod_t *acmod); 269 270 /** 271 * Rewind the current utterance, allowing it to be rescored. 272 * 273 * After calling this function, the internal frame index is reset, and 274 * acmod_score() will return scores starting at the first frame of the 275 * current utterance. Currently, acmod_set_grow() must have been 276 * called to enable growing the feature buffer in order for this to 277 * work. In the future, senone scores may be cached instead. 278 * 279 * @return 0 for success, <0 for failure (if the utterance can't be 280 * rewound due to no feature or score data available) 281 */ 282 int acmod_rewind(acmod_t *acmod); 283 284 /** 285 * Advance the frame index. 286 * 287 * This function moves to the next frame of input data. Subsequent 288 * calls to acmod_score() will return scores for that frame, until the 289 * next call to acmod_advance(). 290 * 291 * @return New frame index. 292 */ 293 int acmod_advance(acmod_t *acmod); 294 295 /** 296 * Set memory allocation policy for utterance processing. 297 * 298 * @param grow_feat If non-zero, the internal dynamic feature buffer 299 * will expand as necessary to encompass any amount of data fed to the 300 * model. 301 * @return previous allocation policy. 302 */ 303 int acmod_set_grow(acmod_t *acmod, int grow_feat); 304 305 /** 306 * TODO: Set queue length for utterance processing. 307 * 308 * This function allows multiple concurrent passes of search to 309 * operate on different parts of the utterance. 310 */ 311 312 /** 313 * Feed raw audio data to the acoustic model for scoring. 314 * 315 * @param inout_raw In: Pointer to buffer of raw samples 316 * Out: Pointer to next sample to be read 317 * @param inout_n_samps In: Number of samples available 318 * Out: Number of samples remaining 319 * @param full_utt If non-zero, this block represents a full 320 * utterance and should be processed as such. 321 * @return Number of frames of data processed. 322 */ 323 int acmod_process_raw(acmod_t *acmod, 324 int16 const **inout_raw, 325 size_t *inout_n_samps, 326 int full_utt); 327 328 /** 329 * Feed acoustic feature data into the acoustic model for scoring. 330 * 331 * @param inout_cep In: Pointer to buffer of features 332 * Out: Pointer to next frame to be read 333 * @param inout_n_frames In: Number of frames available 334 * Out: Number of frames remaining 335 * @param full_utt If non-zero, this block represents a full 336 * utterance and should be processed as such. 337 * @return Number of frames of data processed. 338 */ 339 int acmod_process_cep(acmod_t *acmod, 340 mfcc_t ***inout_cep, 341 int *inout_n_frames, 342 int full_utt); 343 344 /** 345 * Feed dynamic feature data into the acoustic model for scoring. 346 * 347 * Unlike acmod_process_raw() and acmod_process_cep(), this function 348 * accepts a single frame at a time. This is because there is no need 349 * to do buffering when using dynamic features as input. However, if 350 * the dynamic feature buffer is full, this function will fail, so you 351 * should either always check the return value, or always pair a call 352 * to it with a call to acmod_score(). 353 * 354 * @param feat Pointer to one frame of dynamic features. 355 * @return Number of frames processed (either 0 or 1). 356 */ 357 int acmod_process_feat(acmod_t *acmod, 358 mfcc_t **feat); 359 360 /** 361 * Set up a senone score dump file for input. 362 * 363 * @param insenfh File handle of dump file 364 * @return 0 for success, <0 for failure 365 */ 366 int acmod_set_insenfh(acmod_t *acmod, FILE *insenfh); 367 368 /** 369 * Read one frame of scores from senone score dump file. 370 * 371 * @return Number of frames read or <0 on error. 372 */ 373 int acmod_read_scores(acmod_t *acmod); 374 375 /** 376 * Get a frame of dynamic feature data. 377 * 378 * @param inout_frame_idx Input: frame index to get, or NULL 379 * to obtain features for the most recent frame. 380 * Output: frame index corresponding to this 381 * set of features. 382 * @return Feature array, or NULL if requested frame is not available. 383 */ 384 mfcc_t **acmod_get_frame(acmod_t *acmod, int *inout_frame_idx); 385 386 /** 387 * Score one frame of data. 388 * 389 * @param inout_frame_idx Input: frame index to score, or NULL 390 * to obtain scores for the most recent frame. 391 * Output: frame index corresponding to this 392 * set of scores. 393 * @return Array of senone scores for this frame, or NULL if no frame 394 * is available for scoring (such as if a frame index is 395 * requested that is not yet or no longer available). The 396 * data pointed to persists only until the next call to 397 * acmod_score() or acmod_advance(). 398 */ 399 int16 const *acmod_score(acmod_t *acmod, 400 int *inout_frame_idx); 401 402 /** 403 * Write senone dump file header. 404 */ 405 int acmod_write_senfh_header(acmod_t *acmod, FILE *logfh); 406 407 /** 408 * Write a frame of senone scores to a dump file. 409 */ 410 int acmod_write_scores(acmod_t *acmod, int n_active, uint8 const *active, 411 int16 const *senscr, FILE *senfh); 412 413 414 /** 415 * Get best score and senone index for current frame. 416 */ 417 int acmod_best_score(acmod_t *acmod, int *out_best_senid); 418 419 /** 420 * Clear set of active senones. 421 */ 422 void acmod_clear_active(acmod_t *acmod); 423 424 /** 425 * Activate senones associated with an HMM. 426 */ 427 void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm); 428 429 /** 430 * Activate a single senone. 431 */ 432 #define acmod_activate_sen(acmod, sen) bitvec_set((acmod)->senone_active_vec, sen) 433 434 /** 435 * Build active list from 436 */ 437 int32 acmod_flags2list(acmod_t *acmod); 438 439 #endif /* __ACMOD_H__ */ 440