/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1995,1996                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/

#ifndef __EST_SIGPR_FRAME_H__
#define __EST_SIGPR_FRAME_H__

#include "EST_FMatrix.h"



/**@name Linear Prediction functions
Including generation of coefficients from the signal, reflection
coefficients, line spectral frequencies and areas.
*/
//@{

/** Produce the full set of linear prediction coefficients from a
frame of speech waveform.

@param sig: the frame of input waveform
@param acf: the autocorrelation coefficients
@param ref: the reflection coefficients
@param lpc: the LPC coefficients

The order of the lpc analysis is given as the size of the
<parameter>lpc</parameter> vector - 1. The coefficients are placed in
the locations 1 - size, and the energy is placed in location 0.
*/

void sig2lpc(const EST_FVector &sig, EST_FVector &acf,
             EST_FVector &ref, EST_FVector &lpc);


/** Calculate cepstral coefficients from lpc coefficients.

It is possible to calculate a set of cepstral coefficients from
lpc coefficients using the relationship:

\[c_{k}= a_{k} + \frac{1}{k}\sum_{i=1}^{k-1} i c_{i} a_{k-i}\]

The order of the cepstral analysis can be different from the lpc
order. If the cepstral order is greater, interpolation is used (FINISH
add equation). Both orders are taken from the lengths of the
respective vectors. Note that these cepstral coefficients take on the
assumptions (and errors) of the lpc model and hence will not be the
same as cepstral coefficients calculated using DFT functions.

@param lpc: the LPC coefficients (input)
@param cep: the cepstral coefficients (output)
*/

void lpc2cep(const EST_FVector &lpc, EST_FVector &cep);



/** Produce a set of linear prediction coefficients from a
frame of speech waveform. {\tt sig} is the frame of input waveform,
and {\tt lpc} are the LPC coefficients. The
{\bf order} of the lpc analysis is given as the size of the {\tt lpc}
vector -1. The coefficients are placed in the locations 1 - size, and
the energy is placed in location 0.
*/
void sig2lpc(const EST_FVector &sig, EST_FVector &lpc);

/** Produce a set of reflection coefficients from a
frame of speech waveform. {\tt sig} is the frame of input waveform,
and {\tt ref} are the reflection coefficients. The
{\bf order} of the analysis is given as the size of the {\tt ref}
vector -1. The coefficients are placed in the locations 1 - size, and
the energy is placed in location 0.
*/
// NOTE(review): the description above was evidently adapted from the
// two-argument sig2lpc comment (it previously referred to an {\tt lpc}
// vector that this function does not take). Confirm against the
// implementation that {\tt ref} really uses the same layout, i.e. that
// location 0 holds the energy.
void sig2ref(const EST_FVector &sig, EST_FVector &ref);


/**@name Area Functions
Using the analogy of the lossless tube, the
cross-sectional areas of the sections of this tube are related to the
reflection coefficients and can be calculated from the following
relationship:

\[\frac{A_{i+1}}{A_{i}} = \frac{1 - k_{i}}{1 + k_{i}} \]

*/
//@{
/** The area according to the formula. */
void ref2truearea(const EST_FVector &ref, EST_FVector &area);

/** An approximation of the area is calculated by skipping the denominator
in the formula. */
void ref2area(const EST_FVector &ref, EST_FVector &area);

/** The logs of the areas. */
void ref2logarea(const EST_FVector &ref, EST_FVector &logarea);
//@}

/** Calculate the reflection coefficients from the lpc
coefficients. Note that in the standard linear prediction analysis,
the reflection coefficients are generated as a by-product. @see
sig2lpc */

void lpc2ref(const EST_FVector &lpc, EST_FVector &ref);

/** Calculate the linear prediction coefficients from the reflection
coefficients.

@see lpc2ref*/
// NOTE(review): this comment used to quote the frame power equation
// (the one documented for sig2pow), which does not describe this
// conversion and looks like a copy-paste error. The actual recursion
// should be documented here from the implementation.

void ref2lpc(const EST_FVector &ref, EST_FVector &lpc);

/** Calculate line spectral frequencies from linear prediction coefficients.

@see lsf2lpc
*/
// NOTE(review): this comment used to quote the frame power equation
// (the one documented for sig2pow), which does not describe this
// conversion and looks like a copy-paste error.

void lpc2lsf(const EST_FVector &lpc, EST_FVector &lsf);

/** Calculate linear prediction coefficients from line spectral frequencies.

@see lpc2lsf
*/
// NOTE(review): the summary previously described the opposite direction
// (lpc to lsf) and quoted the unrelated frame power equation; both were
// copied from neighbouring comments.

void lsf2lpc(const EST_FVector &lsf, EST_FVector &lpc);
//@}

// Convert a frame from the representation named by in_type to the
// representation named by out_type, writing the result to out_frame.
// NOTE(review): the accepted type names and the behaviour for an
// unsupported pair are defined by the implementation and cannot be
// determined from this header - confirm before relying on them.
void frame_convert(const EST_FVector &in_frame, const EST_String &in_type,
                   EST_FVector &out_frame, const EST_String &out_type);



// end of lpc functions

/**@name Energy and power frame functions
*/

//@{

/** Calculate the power for a frame of speech. This is defined as
\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]

@param frame: the frame of speech
@param power: receives the calculated power
*/


void sig2pow(EST_FVector &frame, float &power);

/** Calculate the root mean square energy for a frame of speech. This
is defined as \[energy=\sqrt{\frac{1}{n}\sum_{i=1}^{n}a_{i}^2}\]

@param frame: the frame of speech
@param rms_energy: receives the calculated RMS energy
*/

void sig2rms(EST_FVector &frame, float &rms_energy);

//@}
// end of power and energy

/**@name Frame based filter bank and cepstral analysis

These functions are \Ref{Frame based signal processing functions}.
*/

//@{

/** Calculate the (log) energy (or power) in each channel of a Mel
scale filter bank for a frame of speech. The filters are triangular, are
evenly spaced and are all of equal width, on a Mel scale. The upper and lower
cutoffs of each filter are at the centre frequencies of the adjacent filters.
The Mel scale is described under {\tt Hz2Mel}.

@see Hz2Mel
@see sig2fft
@see fft2fbank
*/

void sig2fbank(const EST_FVector &sig,
               EST_FVector &fbank_frame,
               const float sample_rate,
               const bool use_power_rather_than_energy,
               const bool take_log);

/** Calculate the energy (or power) spectrum of a frame of speech. The FFT
order is determined by the number of samples in the frame of speech, and is
a power of 2. Note that the FFT vector returned corresponds to frequencies
from 0 to half the sample rate. Energy is the magnitude of the FFT; power is
the squared magnitude.

@see fft2fbank
@see sig2fbank
*/

void sig2fft(const EST_FVector &sig,
             EST_FVector &fft_vec,
             const bool use_power_rather_than_energy);

/** Given a Mel filter bank description, bin the FFT coefficients
to compute the output of the filters. The first and last elements of
{\tt mel_fbank_frequencies} define the lower and upper bound of
the first and last filters respectively and the intervening elements
give the filter centre frequencies. That is, {\tt mel_fbank_frequencies} has
two more elements than {\tt fbank_vec}.

@see fastFFT
@see sig2fft
@see sig2fbank
@see fbank2melcep
*/

void fft2fbank(const EST_FVector &fft_frame,
               EST_FVector &fbank_vec,
               const float Hz_per_fft_coeff,
               const EST_FVector &mel_fbank_frequencies);

/** Compute the discrete cosine transform of log Mel-scale filter bank output
to get the Mel cepstral coefficients for a frame of speech.
Optional liftering (filtering in the cepstral domain) can be applied to
normalise the magnitudes of the coefficients. This is useful because,
typically, the higher order cepstral coefficients are significantly
smaller than the lower ones and it is often desirable to normalise
the means and variances across coefficients.

The lifter (cepstral filter) used is:
\[c_i' = \{ 1 + \frac{L}{2} \sin \frac{\pi i}{L} \} \; c_i\]

A typical value of L used in speech recognition is 22. A value of L=0 is taken
to mean no liftering. This is equivalent to L=1.

@see sig2fft
@see fft2fbank
@see sig2fbank
*/
// NOTE(review): include_c0 defaults to false; presumably it selects
// whether the zeroth cepstral coefficient is written to mfcc - confirm
// against the implementation.

void fbank2melcep(const EST_FVector &fbank_vec,
                  EST_FVector &mfcc,
                  const float liftering_parameter,
                  const bool include_c0 = false);

/** Make a triangular Mel scale filter. The filter is centred at
{\tt this_mel_centre} and
extends from {\tt this_mel_low} to {\tt this_mel_high}. {\tt half_fft_order}
is the length of a power/energy spectrum covering 0Hz to half the sampling
frequency with a resolution of {\tt Hz_per_fft_coeff}.

The routine returns a vector of weights to be applied to the energy/power
spectrum starting at element {\tt fft_index_start}.
The number of points (FFT coefficients) covered
by the filter is given by the length of the returned vector {\tt filter}.

@see fft2fbank
@see Hz2Mel
@see Mel2Hz
*/

void make_mel_triangular_filter(const float this_mel_centre,
                                const float this_mel_low,
                                const float this_mel_high,
                                const float Hz_per_fft_coeff,
                                const int half_fft_order,
                                int &fft_index_start,
                                EST_FVector &filter);

/**@name Frequency conversion functions

These are functions used in \Ref{Filter bank and cepstral analysis}.
*/

//@{

/** Convert Hertz to Mel. The Mel scale is defined by
\[f_{\mbox{Mel}} = 1127 \; \log( 1 + \frac{f_{\mbox{Hertz}}}{700} )\]

@see Mel2Hz
@see Frequency conversion functions
*/

float Hz2Mel(float frequency_in_Hertz);

/**
Convert Mel to Hertz.

@see Hz2Mel
*/

float Mel2Hz(float frequency_in_Mel);

//@}
// end of frequency conversion functions

//@}
// end of filter bank and cepstral analysis




#endif /* __EST_SIGPR_FRAME_H__ */