1 /**
2  * @file   mfcc.h
3  *
4  * <JA>
5  * @brief MFCC計算のための定義
6  *
7  * このファイルには，音声波形データからMFCC形式の特徴量ベクトル系列を
8  * 計算するための構造体の定義およびデフォルト値が含まれています．
9  * デフォルト値は Julius とともに配布されている音響モデルで使用している
10  * 値であり，HTKのデフォルトとは値が異なる部分がありますので注意して下さい．
11  * </JA>
12  * <EN>
13  * @brief Definitions for MFCC computation
14  *
15  * This file contains structures and default values for extracting speech
16  * parameter vectors of Mel-Frequency Cepstral Coefficients (MFCC).
17  * The default values here are the ones used in the standard acoustic models
18  * distributed together with Julius, and some of them differ from the
19  * HTK defaults, so be careful with the default values.
20  * </EN>
21  *
22  * @sa libsent/src/wav2mfcc/wav2mfcc.c
23  * @sa libsent/src/wav2mfcc/wav2mfcc-pipe.c
24  * @sa julius/wav2mfcc.c
25  * @sa julius/realtime-1stpass.c
26  *
27  * @author Akinobu LEE
28  * @date   Fri Feb 11 03:40:52 2005
29  *
30  * $Revision: 1.4 $
31  *
32  */
33 
34 
35 /************************************************************************/
36 /*    mfcc.h                                                            */
37 /*                                                                      */
38 /*    Author    : Yuichiro Nakano                                       */
39 /************************************************************************/
40 
41 #ifndef __MFCC_H__
42 #define __MFCC_H__
43 
44 /// DEBUG: define this macro to enable debug messages for sin/cos table operations (left undefined here)
45 #undef MFCC_TABLE_DEBUG
46 
47 #define CPMAX 500		///< Maximum number of frames to store cepstral mean for realtime CMN update
48 #define CPSTEP 5		///< Allocation step of the cmean list per sentence
49 
50 #include <sent/stddefs.h>
51 #include <sent/htk_defs.h>
52 #include <sent/htk_param.h>
53 #include <ctype.h>
54 
55 #define DEF_SMPPERIOD   625	///< Default sampling period in 100ns units (625 = 16kHz)
56 #define DEF_FRAMESIZE   400	///< Default window size in samples, similar to WINDOWSIZE in HTK (unit is different)
57 #define DEF_FFTNUM      512	///< Number of FFT steps
58 #define DEF_FRAMESHIFT  160	///< Default frame shift length in samples
59 #define DEF_PREENPH     0.97	///< Default pre-emphasis coefficient, corresponds to PREEMCOEF in HTK
60 #define DEF_MFCCDIM     12	///< Default number of MFCC dimensions, corresponds to NUMCEPS in HTK
61 #define DEF_CEPLIF      22	///< Default cepstral liftering coefficient, corresponds to CEPLIFTER in HTK
62 #define DEF_FBANK       24	///< Default number of filterbank channels, corresponds to NUMCHANS in HTK
63 #define DEF_DELWIN      2	///< Default delta window size, corresponds to DELTAWINDOW in HTK
64 #define DEF_ACCWIN      2	///< Default acceleration window size, corresponds to ACCWINDOW in HTK
65 #define DEF_SILFLOOR    50.0	///< Default energy silence floor in dB, corresponds to SILFLOOR in HTK
66 #define DEF_ESCALE      1.0	///< Default scaling coefficient of log energy, corresponds to ESCALE in HTK
67 
68 #define DEF_SSALPHA     2.0	///< Default alpha coefficient for spectral subtraction
69 #define DEF_SSFLOOR     0.5	///< Default flooring coefficient for spectral subtraction
70 
71 /* version 2 ... ss_floor and ss_alpha removed */
72 /* version 3 ... usepower added */
73 #define VALUE_VERSION 3	///< Integer version number of the Value structure, for embedding
74 
75 /// MFCC configuration parameter values
76 typedef struct {
77   long smp_period;      ///< Sampling period in 100ns units
78   long smp_freq;	///< Sampling frequency
79   int framesize;        ///< Window size in samples, similar to WINDOWSIZE in HTK (unit is different)
80   int frameshift;       ///< Frame shift length in samples
81   float preEmph;        ///< Pre-emphasis coefficient, corresponds to PREEMCOEF in HTK
82   int lifter;           ///< Cepstral liftering coefficient, corresponds to CEPLIFTER in HTK
83   int fbank_num;        ///< Number of filterbank channels, corresponds to NUMCHANS in HTK
84   int delWin;           ///< Delta window size, corresponds to DELTAWINDOW in HTK
85   int accWin;           ///< Acceleration window size, corresponds to ACCWINDOW in HTK
86   float silFloor;       ///< Energy silence floor in dB, corresponds to SILFLOOR in HTK
87   float escale;         ///< Scaling coefficient of log energy, corresponds to ESCALE in HTK
88   int hipass;		///< High frequency cut-off in fbank analysis, -1 if disabled, corresponds to HIFREQ in HTK
89   int lopass;		///< Low frequency cut-off in fbank analysis, -1 if disabled, corresponds to LOFREQ in HTK
90   int enormal;          ///< 1 to normalise raw energy, 0 if disabled, corresponds to ENORMALISE in HTK
91   int raw_e;            ///< 1 to use raw energy, 0 if disabled, corresponds to RAWENERGY in HTK
92   int zmeanframe;	///< 1 to apply zero mean per frame, like ZMEANSOURCE in HTK
93   int usepower;		///< 1 to use power instead of magnitude in filterbank analysis
94   float vtln_alpha;	///< Warping factor for VTLN, corresponds to WARPFREQ in HTK
95   float vtln_upper;	///< High frequency cut-off for VTLN, corresponds to WARPUCUTOFF in HTK
96   float vtln_lower;	///< Low frequency cut-off for VTLN, corresponds to WARPLCUTOFF in HTK
97 
98   /* The items below do not need to be embedded, because they can be
99      determined from the acoustic model header, or should be computed
100      from run-time variables */
101   int delta;            ///< 1 if delta coef. needs to be computed
102   int acc;              ///< 1 if acceleration coef. needs to be computed
103   int energy;		///< 1 if energy coef. needs to be computed
104   int c0;		///< 1 to use the 0'th cepstral parameter, 0 if disabled, corresponds to _0 qualifier in HTK
105   int absesup;		///< 1 if absolute energy should be suppressed
106   int cmn;              ///< 1 to use Cepstrum Mean Normalization, 0 if disabled, corresponds to _Z qualifier in HTK
107   int cvn;		///< 1 to use cepstral variance normalization, else 0
108   int mfcc_dim;         ///< Number of MFCC dimensions
109   int baselen;		///< Number of base MFCC dimensions including energies
110   int vecbuflen;	///< Vector length needed for computation
111   int veclen;		///< Resulting length of vector
112 
113   int loaded;		///< 1 if these parameters were loaded from an HTK config file or binhmm header
114 }Value;
115 
116 /// Workspace for filterbank analysis
117 typedef struct {
118    int fftN;            ///< Number of FFT points
119    int n;               ///< log2(fftN)
120    int klo;             ///< FFT index of the lopass cut-off
121    int khi;             ///< FFT index of the hipass cut-off
122    float fres;          ///< Scaled FFT resolution
123    float *cf;           ///< Array[1..pOrder+1] of centre frequencies
124    short *loChan;       ///< Array[1..fftN/2] of loChan indices
125    float *loWt;         ///< Array[1..fftN/2] of loChan weights
126    float *Re;           ///< Array[1..fftN] of FFT channels (real part)
127    float *Im;           ///< Array[1..fftN] of FFT channels (imaginary part)
128 } FBankInfo;
129 
130 /// Cyclic buffer for delta computation
131 typedef struct {
132   float **mfcc;			///< MFCC buffer
133   int veclen;			///< Vector length of the buffered MFCC vectors
134   float *vec;			///< Points to the current MFCC
135   int win;			///< Delta window length
136   int len;			///< Length of the buffer (= win*2+1)
137   int store;			///< Next storing point
138   boolean *is_on;		///< TRUE if data filled
139   int B;			///< B coef. for delta computation
140 } DeltaBuf;
141 
142 /// Work area for MFCC computation
143 typedef struct {
144   float *bf;			///< Local buffer holding the windowed waveform
145   double *fbank;   ///< Local buffer holding the filterbank
146   FBankInfo fb;	///< Local buffer holding filterbank information
147   int bflen;			///< Length of above
148 #ifdef MFCC_SINCOS_TABLE
149   double *costbl_hamming; ///< Cos table for the hamming window
150   int costbl_hamming_len; ///< Length of above
151   /* cos/-sin table for FFT */
152   double *costbl_fft; ///< Cos table for FFT
153   double *sintbl_fft; ///< Sin table for FFT
154   int tbllen; ///< Length of above
155   /* cos table for MakeMFCC */
156   double *costbl_makemfcc; ///< Cos table for DCT
157   int costbl_makemfcc_len; ///< Length of above
158   /* sin table for WeightCepstrum */
159   double *sintbl_wcep; ///< Sin table for cepstrum weighting
160   int sintbl_wcep_len; ///< Length of above
161 #endif /* MFCC_SINCOS_TABLE */
162   float sqrt2var; ///< Work area holding the value of sqrt(2.0) / fbank_num
163   float *ssbuf;			///< Pointer to the noise spectrum for spectral subtraction (SS)
164   int ssbuflen;			///< Length of @a ssbuf
165   float ss_floor;		///< Flooring value for SS
166   float ss_alpha;		///< Alpha scaling value for SS
167 } MFCCWork;
168 
169 /**
170  * Structure holding the per-sentence sum of MFCC for realtime CMN
171  *
172  */
173 typedef struct {
174   float *mfcc_sum;		///< Sum of MFCC parameters
175   float *mfcc_var;		///< Variance sum of MFCC parameters
176   int framenum;			///< Number of summed frames
177 } CMEAN;
178 
179 /**
180  * Work area for real-time CMN
181  *
182  */
183 typedef struct {
184   CMEAN *clist;		///< List of MFCC sums for previous inputs
185   int clist_max;		///< Allocated number of CMEAN in clist
186   int clist_num;		///< Currently filled number of CMEAN in clist
187   float cweight;		///< Weight of the initial cepstral mean
188   float *cmean_init;	///< Initial cepstral mean for each input
189   float *cvar_init;		///< Initial cepstral standard deviation for each input
190   int mfcc_dim;			///< Base MFCC dimension (to apply CMN)
191   int veclen;			///< Full MFCC vector length
192   boolean mean;			///< TRUE if CMN is enabled
193   boolean var;			///< TRUE if CVN is enabled
194   boolean cmean_init_set;	///< TRUE if cmean_init (and cvar_init) was set
195   CMEAN now;		///< Work area holding the current cepstral mean
196 } CMNWork;
197 
198 /**
199  * Work area for energy normalization on real-time input
200  *
201  */
202 typedef struct {
203   LOGPROB max_last;	///< Maximum energy value of the last input
204   LOGPROB min_last;	///< Minimum floored energy value of the last input
205   LOGPROB max;	///< Maximum energy value of the current input
206 } ENERGYWork;
207 
208 /**** mfcc-core.c ****/
209 MFCCWork *WMP_work_new(Value *para);
210 void WMP_calc(MFCCWork *w, float *mfcc, Value *para);
211 void WMP_free(MFCCWork *w);
212 /* Get filterbank information */
213 boolean InitFBank(MFCCWork *w, Value *para);
214 void FreeFBank(FBankInfo *fb);
215 /* Apply hamming window */
216 void Hamming (float *wave, int framesize, MFCCWork *w);
217 /* Apply pre-emphasis filter */
218 void PreEmphasise (float *wave, int framesize, float preEmph);
219 /* Return mel-frequency */
220 float Mel(int k, float fres);
221 /* Apply FFT */
222 void FFT(float *xRe, float *xIm, int p, MFCCWork *w);
223 /* Convert wave -> mel-frequency filterbank */
224 void MakeFBank(float *wave, MFCCWork *w, Value *para);
225 /* Apply the DCT to filterbank */
226 void MakeMFCC(float *mfcc, Value *para, MFCCWork *w);
227 /* Calculate the 0'th cepstral parameter */
228 float CalcC0(MFCCWork *w, Value *para);
229 /* Calculate log raw energy */
230 float CalcLogRawE(float *wave, int framesize);
231 /* Zero mean source by frame */
232 void ZMeanFrame(float *wave, int framesize);
233 /* Re-scale cepstral coefficients */
234 void WeightCepstrum (float *mfcc, Value *para, MFCCWork *w);
235 
236 /**** wav2mfcc-buffer.c ****/
237 /* Convert wave -> MFCC_E_D_(Z) (batch) */
238 int Wav2MFCC(SP16 *wave, float **mfcc, Value *para, int nSamples, MFCCWork *w);
239 /* Calculate delta coefficients (batch) */
240 void Delta(float **c, int frame, Value *para);
241 /* Calculate acceleration coefficients (batch) */
242 void Accel(float **c, int frame, Value *para);
243 /* Normalise log energy (batch) */
244 void NormaliseLogE(float **c, int frame_num, Value *para);
245 /* Cepstrum Mean Normalization (batch) */
246 void CMN(float **mfcc, int frame_num, int dim);
247 void MVN(float **mfcc, int frame_num, Value *para); /* NOTE(review): undocumented here; presumably mean and variance normalization (batch) -- confirm */
248 
249 /**** wav2mfcc-pipe.c: DeltaBuf, CMNWork and ENERGYWork handling ****/
250 DeltaBuf *WMP_deltabuf_new(int veclen, int windowlen);
251 void WMP_deltabuf_free(DeltaBuf *db);
252 void WMP_deltabuf_prepare(DeltaBuf *db);
253 boolean WMP_deltabuf_proceed(DeltaBuf *db, float *new_mfcc);
254 boolean WMP_deltabuf_flush(DeltaBuf *db);
255 
256 CMNWork *CMN_realtime_new(Value *para, float weight);
257 void CMN_realtime_free(CMNWork *c);
258 void CMN_realtime_prepare(CMNWork *c);
259 void CMN_realtime(CMNWork *c, float *mfcc);
260 void CMN_realtime_update(CMNWork *c, HTK_Param *param);
261 boolean CMN_load_from_file(CMNWork *c, char *filename);
262 boolean CMN_save_to_file(CMNWork *c, char *filename);
263 
264 void energy_max_init(ENERGYWork *energy);
265 void energy_max_prepare(ENERGYWork *energy, Value *para);
266 LOGPROB energy_max_normalize(ENERGYWork *energy, LOGPROB f, Value *para);
267 
268 /**** ss.c ****/
269 /* Spectral subtraction (SS) */
270 float *new_SS_load_from_file(char *filename, int *slen);
271 float *new_SS_calculate(SP16 *wave, int wavelen, int *slen, MFCCWork *w, Value *para);
272 
273 /**** para.c: handling of the Value parameter structure ****/
274 void undef_para(Value *para);
275 void make_default_para(Value *para);
276 void make_default_para_htk(Value *para);
277 void apply_para(Value *dst, Value *src);
278 boolean htk_config_file_parse(char *HTKconffile, Value *para);
279 void calc_para_from_header(Value *para, short param_type, short vec_size);
280 void put_para(FILE *fp, Value *para);
281 
282 
283 #endif /* __MFCC_H__ */
284