1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2008 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 /**
39  * @file acmod.h Acoustic model structures for PocketSphinx.
40  * @author David Huggins-Daines <dhuggins@cs.cmu.edu>
41  */
42 
43 #ifndef __ACMOD_H__
44 #define __ACMOD_H__
45 
46 /* System headers. */
47 #include <stdio.h>
48 
49 /* SphinxBase headers. */
50 #include <sphinxbase/cmd_ln.h>
51 #include <sphinxbase/logmath.h>
52 #include <sphinxbase/fe.h>
53 #include <sphinxbase/feat.h>
54 #include <sphinxbase/bitvec.h>
55 #include <sphinxbase/err.h>
56 #include <sphinxbase/prim_type.h>
57 
58 /* Local headers. */
59 #include "ps_mllr.h"
60 #include "bin_mdef.h"
61 #include "tmat.h"
62 #include "hmm.h"
63 
/**
 * Utterance processing states of the acoustic model.
 *
 * The enumerators are numbered explicitly; these are the same values
 * the compiler assigns implicitly (0 through 3).
 */
typedef enum acmod_state_e {
    ACMOD_IDLE = 0,        /**< No utterance is in progress. */
    ACMOD_STARTED = 1,     /**< An utterance has started, but no data yet. */
    ACMOD_PROCESSING = 2,  /**< An utterance is in progress. */
    ACMOD_ENDED = 3        /**< The utterance has ended, still buffering. */
} acmod_state_t;
73 
/**
 * Placeholder senone score given to states that are active
 * unintentionally.
 *
 * 0x7fff (32767) is the largest positive value of a 16-bit signed
 * integer, the type used for senone scores in this header.
 */
#define SENSCR_DUMMY 0x7fff
78 
79 /**
80  * Feature space linear transform structure.
81  */
82 struct ps_mllr_s {
83     int refcnt;     /**< Reference count. */
84     int n_class;    /**< Number of MLLR classes. */
85     int n_feat;     /**< Number of feature streams. */
86     int *veclen;    /**< Length of input vectors for each stream. */
87     float32 ****A;  /**< Rotation part of mean transformations. */
88     float32 ***b;   /**< Bias part of mean transformations. */
89     float32 ***h;   /**< Diagonal transformation of variances. */
90     int32 *cb2mllr; /**< Mapping from codebooks to transformations. */
91 };
92 
/**
 * Acoustic model parameter object.
 *
 * The typedef is introduced ahead of the definition of
 * struct ps_mgau_s so that the function table type can name it in
 * its function pointer signatures.
 */
typedef struct ps_mgau_s ps_mgau_t;
97 
98 typedef struct ps_mgaufuncs_s {
99     char const *name;
100 
101     int (*frame_eval)(ps_mgau_t *mgau,
102                       int16 *senscr,
103                       uint8 *senone_active,
104                       int32 n_senone_active,
105                       mfcc_t ** feat,
106                       int32 frame,
107                       int32 compallsen);
108     int (*transform)(ps_mgau_t *mgau,
109                      ps_mllr_t *mllr);
110     void (*free)(ps_mgau_t *mgau);
111 } ps_mgaufuncs_t;
112 
113 struct ps_mgau_s {
114     ps_mgaufuncs_t *vt;  /**< vtable of mgau functions. */
115     int frame_idx;       /**< frame counter. */
116 };
117 
/**
 * Cast a pointer to an acoustic model parameter object to
 * ps_mgau_t *.
 */
#define ps_mgau_base(mg) ((ps_mgau_t *)(mg))

/*
 * The wrappers below call through the function table of mg.  In each
 * of them mg appears twice in the expansion and is therefore
 * evaluated twice, so do not pass an expression with side effects.
 * The "(*pointer)(args)" call form also keeps the member name "free"
 * from being followed directly by "(".
 */

/** Call the frame_eval entry of the function table of mg. */
#define ps_mgau_frame_eval(mg, senscr, senone_active, n_senone_active, feat, frame, compallsen) \
    (*ps_mgau_base(mg)->vt->frame_eval) \
    (mg, senscr, senone_active, n_senone_active, feat, frame, compallsen)

/** Call the transform entry of the function table of mg. */
#define ps_mgau_transform(mg, mllr) \
    (*ps_mgau_base(mg)->vt->transform)(mg, mllr)

/** Call the free entry of the function table of mg. */
#define ps_mgau_free(mg) \
    (*ps_mgau_base(mg)->vt->free)(mg)
126 
127 /**
128  * Acoustic model structure.
129  *
130  * This object encapsulates all stages of acoustic processing, from
131  * raw audio input to acoustic score output.  The reason for grouping
132  * all of these modules together is that they all have to "agree" in
133  * their parameterizations, and the configuration of the acoustic and
134  * dynamic feature computation is completely dependent on the
135  * parameters used to build the original acoustic model (which should
136  * by now always be specified in a feat.params file).
137  *
138  * Because there is not a one-to-one correspondence from blocks of
139  * input audio or frames of input features to frames of acoustic
140  * scores (due to dynamic feature calculation), results may not be
141  * immediately available after input, and the output results will not
142  * correspond to the last piece of data input.
143  *
144  * TODO: In addition, this structure serves the purpose of queueing
145  * frames of features (and potentially also scores in the future) for
146  * asynchronous passes of recognition operating in parallel.
147  */
148 struct acmod_s {
149     /* Global objects, not retained. */
150     cmd_ln_t *config;          /**< Configuration. */
151     logmath_t *lmath;          /**< Log-math computation. */
152     glist_t strings;           /**< Temporary acoustic model filenames. */
153 
154     /* Feature computation: */
155     fe_t *fe;                  /**< Acoustic feature computation. */
156     feat_t *fcb;               /**< Dynamic feature computation. */
157 
158     /* Model parameters: */
159     bin_mdef_t *mdef;          /**< Model definition. */
160     tmat_t *tmat;              /**< Transition matrices. */
161     ps_mgau_t *mgau;           /**< Model parameters. */
162     ps_mllr_t *mllr;           /**< Speaker transformation. */
163 
164     /* Senone scoring: */
165     int16 *senone_scores;      /**< GMM scores for current frame. */
166     bitvec_t *senone_active_vec; /**< Active GMMs in current frame. */
167     uint8 *senone_active;      /**< Array of deltas to active GMMs. */
168     int senscr_frame;          /**< Frame index for senone_scores. */
169     int n_senone_active;       /**< Number of active GMMs. */
170     int log_zero;              /**< Zero log-probability value. */
171 
172     /* Utterance processing: */
173     mfcc_t **mfc_buf;   /**< Temporary buffer of acoustic features. */
174     mfcc_t ***feat_buf; /**< Temporary buffer of dynamic features. */
175     FILE *rawfh;        /**< File for writing raw audio data. */
176     FILE *mfcfh;        /**< File for writing acoustic feature data. */
177     FILE *senfh;        /**< File for writing senone score data. */
178     FILE *insenfh;	/**< Input senone score file. */
179     long *framepos;     /**< File positions of recent frames in senone file. */
180 
181     /* A whole bunch of flags and counters: */
182     uint8 state;        /**< State of utterance processing. */
183     uint8 compallsen;   /**< Compute all senones? */
184     uint8 grow_feat;    /**< Whether to grow feat_buf. */
185     uint8 insen_swap;   /**< Whether to swap input senone score. */
186 
187     frame_idx_t output_frame; /**< Index of next frame of dynamic features. */
188     frame_idx_t n_mfc_alloc;  /**< Number of frames allocated in mfc_buf */
189     frame_idx_t n_mfc_frame;  /**< Number of frames active in mfc_buf */
190     frame_idx_t mfc_outidx;   /**< Start of active frames in mfc_buf */
191     frame_idx_t n_feat_alloc; /**< Number of frames allocated in feat_buf */
192     frame_idx_t n_feat_frame; /**< Number of frames active in feat_buf */
193     frame_idx_t feat_outidx;  /**< Start of active frames in feat_buf */
194 };
195 typedef struct acmod_s acmod_t;
196 
197 /**
198  * Initialize an acoustic model.
199  *
200  * @param config a command-line object containing parameters.  This
201  *               pointer is not retained by this object.
202  * @param lmath global log-math parameters.
203  * @param fe a previously-initialized acoustic feature module to use,
204  *           or NULL to create one automatically.  If this is supplied
205  *           and its parameters do not match those in the acoustic
206  *           model, this function will fail.  This pointer is not retained.
207  * @param fe a previously-initialized dynamic feature module to use,
208  *           or NULL to create one automatically.  If this is supplied
209  *           and its parameters do not match those in the acoustic
210  *           model, this function will fail.  This pointer is not retained.
211  * @return a newly initialized acmod_t, or NULL on failure.
212  */
213 acmod_t *acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb);
214 
215 /**
216  * Adapt acoustic model using a linear transform.
217  *
218  * @param mllr The new transform to use, or NULL to update the existing
219  *              transform.  The decoder retains ownership of this pointer,
220  *              so you should not attempt to free it manually.  Use
221  *              ps_mllr_retain() if you wish to reuse it
222  *              elsewhere.
223  * @return The updated transform object for this decoder, or
224  *         NULL on failure.
225  */
226 ps_mllr_t *acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr);
227 
228 /**
229  * Start logging senone scores to a filehandle.
230  *
231  * @param acmod Acoustic model object.
232  * @param logfh Filehandle to log to.
233  * @return 0 for success, <0 on error.
234  */
235 int acmod_set_senfh(acmod_t *acmod, FILE *senfh);
236 
237 /**
238  * Start logging MFCCs to a filehandle.
239  *
240  * @param acmod Acoustic model object.
241  * @param logfh Filehandle to log to.
242  * @return 0 for success, <0 on error.
243  */
244 int acmod_set_mfcfh(acmod_t *acmod, FILE *logfh);
245 
246 /**
247  * Start logging raw audio to a filehandle.
248  *
249  * @param acmod Acoustic model object.
250  * @param logfh Filehandle to log to.
251  * @return 0 for success, <0 on error.
252  */
253 int acmod_set_rawfh(acmod_t *acmod, FILE *logfh);
254 
255 /**
256  * Finalize an acoustic model.
257  */
258 void acmod_free(acmod_t *acmod);
259 
260 /**
261  * Mark the start of an utterance.
262  */
263 int acmod_start_utt(acmod_t *acmod);
264 
265 /**
266  * Mark the end of an utterance.
267  */
268 int acmod_end_utt(acmod_t *acmod);
269 
270 /**
271  * Rewind the current utterance, allowing it to be rescored.
272  *
273  * After calling this function, the internal frame index is reset, and
274  * acmod_score() will return scores starting at the first frame of the
275  * current utterance.  Currently, acmod_set_grow() must have been
276  * called to enable growing the feature buffer in order for this to
277  * work.  In the future, senone scores may be cached instead.
278  *
279  * @return 0 for success, <0 for failure (if the utterance can't be
280  *         rewound due to no feature or score data available)
281  */
282 int acmod_rewind(acmod_t *acmod);
283 
284 /**
285  * Advance the frame index.
286  *
287  * This function moves to the next frame of input data.  Subsequent
288  * calls to acmod_score() will return scores for that frame, until the
289  * next call to acmod_advance().
290  *
291  * @return New frame index.
292  */
293 int acmod_advance(acmod_t *acmod);
294 
295 /**
296  * Set memory allocation policy for utterance processing.
297  *
298  * @param grow_feat If non-zero, the internal dynamic feature buffer
299  * will expand as necessary to encompass any amount of data fed to the
300  * model.
301  * @return previous allocation policy.
302  */
303 int acmod_set_grow(acmod_t *acmod, int grow_feat);
304 
/*
 * TODO: Add a function to set the queue length for utterance
 * processing.
 *
 * Such a function would allow multiple concurrent passes of search
 * to operate on different parts of the utterance.
 */
311 
312 /**
313  * Feed raw audio data to the acoustic model for scoring.
314  *
315  * @param inout_raw In: Pointer to buffer of raw samples
316  *                  Out: Pointer to next sample to be read
317  * @param inout_n_samps In: Number of samples available
318  *                      Out: Number of samples remaining
319  * @param full_utt If non-zero, this block represents a full
320  *                 utterance and should be processed as such.
321  * @return Number of frames of data processed.
322  */
323 int acmod_process_raw(acmod_t *acmod,
324                       int16 const **inout_raw,
325                       size_t *inout_n_samps,
326                       int full_utt);
327 
328 /**
329  * Feed acoustic feature data into the acoustic model for scoring.
330  *
331  * @param inout_cep In: Pointer to buffer of features
332  *                  Out: Pointer to next frame to be read
333  * @param inout_n_frames In: Number of frames available
334  *                      Out: Number of frames remaining
335  * @param full_utt If non-zero, this block represents a full
336  *                 utterance and should be processed as such.
337  * @return Number of frames of data processed.
338  */
339 int acmod_process_cep(acmod_t *acmod,
340                       mfcc_t ***inout_cep,
341                       int *inout_n_frames,
342                       int full_utt);
343 
344 /**
345  * Feed dynamic feature data into the acoustic model for scoring.
346  *
347  * Unlike acmod_process_raw() and acmod_process_cep(), this function
348  * accepts a single frame at a time.  This is because there is no need
349  * to do buffering when using dynamic features as input.  However, if
350  * the dynamic feature buffer is full, this function will fail, so you
351  * should either always check the return value, or always pair a call
352  * to it with a call to acmod_score().
353  *
354  * @param feat Pointer to one frame of dynamic features.
355  * @return Number of frames processed (either 0 or 1).
356  */
357 int acmod_process_feat(acmod_t *acmod,
358                        mfcc_t **feat);
359 
360 /**
361  * Set up a senone score dump file for input.
362  *
363  * @param insenfh File handle of dump file
364  * @return 0 for success, <0 for failure
365  */
366 int acmod_set_insenfh(acmod_t *acmod, FILE *insenfh);
367 
368 /**
369  * Read one frame of scores from senone score dump file.
370  *
371  * @return Number of frames read or <0 on error.
372  */
373 int acmod_read_scores(acmod_t *acmod);
374 
375 /**
376  * Get a frame of dynamic feature data.
377  *
378  * @param inout_frame_idx Input: frame index to get, or NULL
379  *                        to obtain features for the most recent frame.
380  *                        Output: frame index corresponding to this
381  *                        set of features.
382  * @return Feature array, or NULL if requested frame is not available.
383  */
384 mfcc_t **acmod_get_frame(acmod_t *acmod, int *inout_frame_idx);
385 
386 /**
387  * Score one frame of data.
388  *
389  * @param inout_frame_idx Input: frame index to score, or NULL
390  *                        to obtain scores for the most recent frame.
391  *                        Output: frame index corresponding to this
392  *                        set of scores.
393  * @return Array of senone scores for this frame, or NULL if no frame
394  *         is available for scoring (such as if a frame index is
395  *         requested that is not yet or no longer available).  The
396  *         data pointed to persists only until the next call to
397  *         acmod_score() or acmod_advance().
398  */
399 int16 const *acmod_score(acmod_t *acmod,
400                          int *inout_frame_idx);
401 
402 /**
403  * Write senone dump file header.
404  */
405 int acmod_write_senfh_header(acmod_t *acmod, FILE *logfh);
406 
407 /**
408  * Write a frame of senone scores to a dump file.
409  */
410 int acmod_write_scores(acmod_t *acmod, int n_active, uint8 const *active,
411                        int16 const *senscr, FILE *senfh);
412 
413 
414 /**
415  * Get best score and senone index for current frame.
416  */
417 int acmod_best_score(acmod_t *acmod, int *out_best_senid);
418 
419 /**
420  * Clear set of active senones.
421  */
422 void acmod_clear_active(acmod_t *acmod);
423 
424 /**
425  * Activate senones associated with an HMM.
426  */
427 void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm);
428 
/**
 * Activate a single senone.
 *
 * Expands to bitvec_set() on the senone_active_vec member of acmod.
 * This is a macro, so both arguments are parenthesized in the
 * expansion; that keeps operator precedence intact whatever
 * expression the caller passes.
 *
 * @param acmod Acoustic model object.
 * @param sen Index of the senone to activate.
 */
#define acmod_activate_sen(acmod, sen) \
    bitvec_set((acmod)->senone_active_vec, (sen))
433 
434 /**
435  * Build active list from
436  */
437 int32 acmod_flags2list(acmod_t *acmod);
438 
439 #endif /* __ACMOD_H__ */
440