1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /************************************************************************/
33 /* */
34 /* Author: Paul Taylor (pault@cstr.ed.ac.uk) */
35 /* Date: Fri May 9 1997 */
36 /* ------------------------------------------------------------------- */
37 /* Examples of Generation of Acoustic Feature Vectors from Waveforms */
38 /* */
39 /************************************************************************/
40
41 #include <cstdlib>
42 #include "EST_sigpr.h"
43 #include "EST_cmd_line.h"
44 #include "EST_inline_utils.h"
45 #include "EST_sigpr.h"
46
47 /**@name Signal processing example code
48 *
49 * @toc
50 */
51 //@{
52
53 EST_StrList empty;
54
55 void print_map(EST_TrackMap &t);
56 void print_track_map(EST_Track &t);
57
main(void)58 int main(void)
59
60 {
61 EST_StrList base_list; // decl
62 EST_StrList delta_list; // decl
63 EST_StrList acc_list; // decl
64 EST_Option op, al; // decl
65 init_lib_ops(al, op);
66 EST_Wave sig; // decl
67 EST_Track fv, part; // decl
68 float shift; // decl
69 int i;
70
71
72 cout << "position 1\n";
73
74 /**@name Producing a single type of feature vector for an utterance
75
76 A number of types of signal processing can be performed by the
77 sig2coef function. The following code demonstrates a simple
78 case of calculating the linear prediction (LP) coefficients for
79 a waveform.
80
81 First set the order of the lpc analysis to 16 (this entails 17 actual
82 coefficients) and then load in the waveform to be analysed.
83
84 */
85 //@{
86
87
88 //@{ code
89
90 int lpc_order = 16;
91 sig.load(DATA "/kdt_001.wav");
92
93 //@} code
94
95 /** Now allocate enough space in the track to hold the analysis.
96 The following command resizes fv to have enough frames for
97 analysis frames at 0.01 intervals up to the end of the waveform,
98 (sig.end()), and enough channels to store lpc_order + 1 coefficients.
99 The channels are named so as to take lpc coefficients.
100 */
101 //@{ code
102 int num_frames;
103 num_frames = (int)ceil(sig.end() / 0.01);
104 fv.resize(num_frames, lpc_order + 1);
105 //@} code
106
107 /** The positions of the frames, corresponding to the middel of their
108 analysis window also needs to be set. For fixed frame analysis, this
109 can be done with the fill_time() function: */
110
111 //@{ code
112 fv.fill_time(0.01);
113 //@} code
114
115 /** The simplest way to do the actual analysis is as follows, which
116 will fill the track with the values from the LP analysis using the
117 default processing controls.
118 */
119
120 //@{ code
121 sig2coef(sig, fv, "lpc");
122 //@} code
123
124 /** In this style of analysis, default values are used to control the
125 windowing mechanisms which split the whole signal into frames.
126
127 Specifically, each frame is defined to start a certain distance
128 before the time interval, and extending the same distance after.
129 This distance is calculated as a function of the local window
130 spacing and can be adjusted as follows:
131
132 Extending one time period before and one time period after the
133 current time mark:
134 */
135 //@{ code
136 sig2coef(sig, fv, "lpc", 2.0);
137 //@} code
138 /** Extending 1.5 time periods before and after the
139 current time mark, etc;
140 */
141 //@{ code
142 sig2coef(sig, fv, "lpc", 3.0);
143 //@} code
144
145 /** The type of windowing function may be changed also as this
146 can be passed in as an optional argument. First we
147 create a window function (This is explained more in \Ref{Windowing}).
148 */
149 //@{ code
150 EST_WindowFunc *wf = EST_Window::creator("hamming");
151 //@} code
152 /** and then pass it in as the last argument
153 */
154 //@{ code
155 sig2coef(sig, fv, "lpc", 3.0, wf);
156 //@} code
157 //@}
158
159 /**@name Pitch-Synchronous vs fixed frame analysis.
160
161 Most of the core signal processing functions operate on individual
162 frames of speech and are oblivious as to how these frames were
163 extracted from the original speech. This allows us to take the frames
164 from anywhere in the signal: specifically, this facilitates two
165 common forms of analysis:
166
167 <formalpara><title>fixed frame</title><para>
168 The time points are space at even intervals throughout the signal.
169 </para></formalpara>
170 <formalpara><title>pitch-synchronous</title><para>
171 The time points represent <emphasis>pitchmarks</emphasis>
172 and correspond to a specific position in each pitch period,
173 e.g. the instant of glottal closure.</para></formalpara>
174 <para>
175
176 It is a simple matter to fill the time array, but normally
177 pitchmarks are read from a file or taken from another signal
178 processing algorithm (see \Ref{Pitchmark functions.}).
179 </para>
180 <para>
181
182
183 There are many ways to fill the time array for fixed frame analysis.
184
185 manually:
186
187 */
188 //@{
189
190 //@{ code
191 int num_frames = 300;
192 fv.resize(num_frames, lpc_order + 1);
193 shift = 0.01; // time interval in seconds
194
195 for (i = 0; i < num_frames; ++i)
196 fv.t(i) = shift * (float) i;
197 //@} code
198 /** or by use of the member function \Ref{EST_Track::fill_time}
199 */
200
201 //@{ code
202 fv.fill_time(0.01);
203 //@} code
204
205 /** Pitch synchronous values can simply be read from pitchmark
206 files:
207 */
208 //@{ code
209 fv.load(DATA "/kdt_001.pm");
210 make_track(fv, "lpc", lpc_order + 1);
211 //@} code
212
213 /** Regardless of how the time points where obtain, the analysis
214 function call is just the same:
215 */
216 //@{ code
217 sig2coef(sig, fv, "lpc");
218 //@} code
219 //@}
220
221 cout << "position 3\n";
222
223 /**@name Naming Channels
224 @id sigpr-example-naming-channels
225 Multiple types of feature vector can be stored in the same Track.
226 Imagine that we want lpc, cepstrum and power
227 coefficients in that order in a track. This can be achieved by using
228 the \Ref{sig2coef} function multiple times, or by the wrap
229 around \Ref{sigpr_base} function.
230 </para><para>
231
232 It is vitally important here to ensure that before passing the
233 track to the signal processing functions that it has the correct
234 number of channels and that these are appropriately named. This is
235 most easily done using the track map facility, explained
236 in <LINK LINKEND="proan129">Naming Channels</LINK>
237 </para><para>
238
239 For each call, we only us the part of track that is relevant.
240 The sub_track member function of \Ref{EST_Track} is used to get
241 this. In the following example, we are assuming here that
242 fv has sufficient space for 17
243 lpc coefficients, 8 cepstrum coefficients and power and that
244 they are stored in that order.
245
246 */
247 //@{
248 //@{ code
249
250 int cep_order = 16;
251 EST_StrList map;
252
253 map.append("$lpc-0+" Stringtoi(lpc_order));
254 map.append("$cepc-0+" Stringtoi(cep_order));
255 map.append("power");
256
257 fv.resize(EST_CURRENT, map);
258 //@} code
259
260 /** An alternative is to use <function>add_channels_to_map()</function>
261 which takes a list of coefficient types and makes a map.
262 The order of each type of processing is extracted from
263 op.
264 */
265
266 //@{ code
267
268 EST_StrList coef_types;
269
270 coef_types.append("lpc");
271 coef_types.append("cep");
272 coef_types.append("power");
273
274 map.clear();
275
276 add_channels_to_map(map, coef_types, op);
277 fv.resize(EST_CURRENT, map);
278
279 //@} code
280
281 /** After allocating the right number of frames and channels
282 in {\tt fv}, we extract a sub_track, which has all the frames
283 (i.e. between 0 and EST_ALL) and all the lpc channels
284 */
285 //@{ code
286 fv.sub_track(part, 0, EST_ALL, 0, "lpc_0", "lpc_N");
287 //@} code
288 /** now call the signal processing function on this part:
289 */
290 //@{ code
291 sig2coef(sig, part, "lpc");
292 //@} code
293
294 /** We repeat the procedure for the cepstral coefficients, but this
295 time take the next 8 channels (17-24 inclusive) and calculate the coefficients:
296 */
297 //@{ code
298 fv.sub_track(part, 0, EST_ALL, "cep_0", "cep_N");
299
300 sig2coef(sig, part, "cep");
301 //@} code
302 /** Extract the last channel for power and call the power function:
303 */
304 //@{ code
305 fv.sub_track(part, 0, EST_ALL, "power", 1);
306 power(sig, part, 0.01);
307
308 //@} code
309
310 /** While the above technique is adequate for our needs and is
311 a useful demonstration of sub_track extraction, the
312 \Ref{sigpr_base} function is normally easier to use as it does
313 all the sub track extraction itself. To perform the lpc, cepstrum
314 and power analysis, we put these names into a StrList and
315 call \Ref{sigpr_base}.
316 */
317 //@{ code
318 base_list.clear(); // empty the list, just in case
319 base_list.append("lpc");
320 base_list.append("cep");
321 base_list.append("power");
322
323 sigpr_base(sig, fv, op, base_list);
324 //@} code
325 /** This will call \Ref{sigpr_track} as many times as is necessary.
326 */
327 //@}
328
329 /**@name Producing delta and acceleration coefficients
330
331 Delta coefficients represent the numerical differentiation of a
332 track, and acceleration coefficients represent the second
333 order numerical differentiation.
334
335 By convention, delta coefficients have a "_d" suffix and acceleration
336 coefficients "_a". If the coefficient is multi-dimensional, the
337 numbers go after the "_d" or "_a".
338
339 */
340 //@{
341 //@{ code
342
343 map.append("$cep_d-0+" Stringtoi(cep_order)); // add deltas
344 map.append("$cep_a-0+" Stringtoi(cep_order)); // add accs
345
346 fv.resize(EST_CURRENT, map); // resize the track.
347 //@} code
348 /**
349 Given a EST_Track of coefficients {\tt fv}, the \Ref{delta}
350 function is used to produce the delta equivalents {\tt
351 del}. The following uses the track allocated above and
352 generates a set of cepstral coefficients and then makes their
353 delta and acc:
354
355 */
356 //@{ code
357
358 EST_Track del, acc;
359
360 fv.sub_track(part, 0, EST_ALL, 0, "cep_0", "cep_N"); // make subtrack of coefs
361 sig2coef(sig, part, "cep"); // fill with cepstra
362
363 // make subtrack of deltas
364 fv.sub_track(del, 0, EST_ALL, 0, "cep_d_0", "cep_d_N");
365 delta(part, del); // calculate deltas of part, and place answer in del
366
367 // make subtrack of accs
368 fv.sub_track(acc, 0, EST_ALL, 0, "cep_a_0", "cep_a_N");
369 delta(del, acc); // calculate deltas of del, and place answer in acc
370 //@} code
371 /** It is possible to directly calculate the delta coefficients of
372 a type of coefficient, even if we don't have the base type.
373 \Ref{sigpr_delta} will process the waveform, make a temporary
374 track of the required type "lpc" and calculate the delta of this.
375 </para><para>
376 The following makes a set of delta reflection coefficients:
377
378 */
379 //@{ code
380 map.append("$ref_d-0+" Stringtoi(lpc_order)); // add to map
381 fv.resize(EST_CURRENT, map); // resize the track.
382
383 sigpr_delta(sig, fv, op, "ref");
384 //@} code
385 /** an equivalent function exists for acceleration coefficients:
386 */
387 //@{ code
388 map.append("$lsf_a-0+" Stringtoi(lpc_order)); // add acc lsf
389 fv.resize(EST_CURRENT, map); // resize the track.
390
391 sigpr_acc(sig, fv, op, "ref");
392
393 //@} code
394 //@}
395
396 /**@name Windowing
397
398 The \Ref{EST_Window} class provides a variety of means to
399 divide speech into frames using windowing mechanisms.
400
401 </para><para>
402 A window function can be created from a window name using the
403 \Ref{EST_Window::creator} function:
404 */
405 //@{
406 //@{ code
407
408 EST_WindowFunc *hamm = EST_Window::creator("hamming");
409 EST_WindowFunc *rect = EST_Window::creator("rectangular");
410 //@} code
411 /** This function can then be used to create a EST_TBuffer of
412 window values. In the following example the values from a
413 256 point hamming window are stored in the buffer win_vals:
414 */
415 //@{ code
416 EST_FVector frame;
417 EST_FVector win_vals;
418
419 hamm(256, win_vals);
420 //@} code
421
422 /** The make_window function also creates a window:
423 */
424 //@{ code
425 EST_Window::make_window(win_vals, 256, "hamming",-1);
426 //@} code
427
428 /** this can then be used to make a frame of speech from the main EST_Wave
429 sig. The following example extracts speech starting at sample 1000:
430 */
431 //@} code
432 for (i = 0; i < 256; ++i)
433 frame[i] = (float)sig.a(i + 1000) * win_vals[i];
434 //@} code
435
436 /** Alternatively, exactly the same operation can be performed in a
437 single step by passing the window function to the
438 \Ref{EST_Window::window_signal} function which takes a
439 \Ref{EST_Wave} and performs windowing on a section of it,
440 storing the output in the \Ref{EST_FVector} {\tt frame}.
441 */
442 //@{ code
443 EST_Window::window_signal(sig, hamm, 1000, 256, frame, 1);
444 //@} code
445 /** The window function need not be explicitly created, the window
446 signal can work on just the name of the window type:
447 */
448
449 //@{ code
450 EST_Window::window_signal(sig, "hamming", 1000, 256, frame, 1);
451 //@} code
452
453 //@}
454 /**@name Frame based signal processing
455 @id sigpr-example-frames
456 The signal processing library provides an extensive set of functions
457 which operate on a single frame of coefficients.
458 The following example shows one method of splitting the signal
459 into frames and calling a signal processing algorithm.
460
461 First set up the track for 16 order LP analysis:
462
463 */
464 //@{
465 //@{ code
466
467 map.clear();
468 map.append("$lpc-0+16");
469
470 fv.resize(EST_CURRENT, map);
471
472 //@} code
473 /** In this example, we take the analysis frame length to be 256 samples
474 long, and the shift in samples is just the shift in seconds times the
475 sampling frequency.
476 */
477 //@{ code
478 int s_length = 256;
479 int s_shift = int(shift * float(sig.sample_rate()));
480 EST_FVector coefs;
481 //@} code
482
483 /** Now we set up a loop which calculates the frames one at a time.
484 {\tt start} is the start position in samples of each frame.
485 The \Ref{EST_Window::window_signal} function is called which
486 makes a \Ref{EST_FVector} frame of the speech via a hamming window.
487
488 Using the \Ref{EST_Track::frame} function, the EST_FVector
489 {\tt coefs} is set to frame {\tt k} in the track. It is important
490 to understand that this operation involves setting an internal
491 smart pointer in {\tt coefs} to the memory of frame {\tt k}. This
492 allows the signal processing function \Ref{sig2lpc} to operate
493 on an input and output \Ref{EST_FVector}, without any copying to or
494 from the main track. After the \Ref{sig2lpc} call, the kth frame
495 of {\tt fv} is now filled with the LP coefficients.
496 */
497 //@{ code
498 for (int k1 = 0; k1 < fv.num_frames(); ++k1)
499 {
500 int start = (k1 * s_shift) - (s_length/2);
501 EST_Window::window_signal(sig, "hamming", start, s_length, frame, 1);
502
503 fv.frame(coefs, k1); // Extract a single frame
504 sig2lpc(frame, coefs); // Pass this to actual algorithm
505 }
506 //@} code
507
508 /** A slightly different tack can be taken for pitch-synchronous analysis.
509 Setting up fv with the pitchmarks and channels:
510 */
511 //@{ code
512 fv.load(DATA "/kd1_001.pm");
513 fv.resize(EST_CURRENT, map);
514 //@} code
515 /** Set up as before, but this time calculate the window starts and
516 lengths from the time points. In this example, the length is a
517 {\tt factor} (twice) the local frame shift.
518 Note that the only difference between this function and the fixed
519 frame one is in the calculation of the start and end points - the
520
521 windowing, frame extraction and call to \Ref{sig2lpc} are exactly
522 the same.
523 */
524 //@{ code
525 float factor = 2.0;
526
527 for (int k2 = 0; k2 < fv.num_frames(); ++k2)
528 {
529 s_length = irint(get_frame_size(fv, k2, sig.sample_rate())* factor);
530 int start = (irint(fv.t(k2) * sig.sample_rate()) - (s_length/2));
531
532 EST_Window::window_signal(sig, wf, start, s_length, frame, 1);
533
534 fv.frame(coefs, k2);
535 sig2lpc(frame, coefs);
536 }
537 //@} code
538 //@}
539
540 /**@name Filtering
541
542 In the EST library we so far have two main types of filter,
543 {\bf finite impulse response (FIR)} filters and {\bf linear
544 prediction (LP)} filters. {\bf infinite impulse response (IIR)}
545 filters are not yet implemented, though LP filters are a
546 special case of these.
547 </para><para>
548 Filtering involves 2 stages: the design of the filter and the
549 use of this filter on the waveform.
550 </para><para>
551 First we examine a simple low-pass filter which attempts to suppress
552 all frequencies about a cut-off. Imagine we want to low pass filter
553 a signal at 400Hz. First we design the filter:
554 */
555 //@{
556 //@{ code
557
558 EST_FVector filter;
559 int freq = 400;
560 int filter_order = 99;
561
562 filter = design_lowpass_FIR_filter(sig.sample_rate(), 400, 99);
563 //@} code
564 /** And now use this filter on the signal:
565 */
566 //@{ code
567 FIRfilter(sig, filter);
568 //@} code
569 /** For one-off filtering operations, the filter design can be
570 done in the filter function itself. The \Ref{FIRlowpass_filter}
571 function takes the signal, cut-off frequency and order as
572 arguments and designs the filter on the fly. Because of the
573 overhead of filter design, this function is expensive and
574 should only be used for one-off operations.
575 */
576 //@{ code
577 FIRlowpass_filter(sig, 400, 99);
578 //@} code
579 /** The equivalent operations exist for high-pass filtering:
580 */
581 //@{ code
582 filter = design_highpass_FIR_filter(sig.sample_rate(), 50, 99);
583 FIRfilter(sig, filter);
584 FIRhighpass_filter(sig, 50, 99);
585 //@} code
586 /** Filters of arbitrary frequency response can also be designed using
587 the \Ref{design_FIR_filter} function. This function takes a
588 EST_FVector of order $2^{N}$ which specifies the desired frequency
589 response up to 1/2 the sampling frequency. The function returns
590 a set of filter coefficients that attempt to match the desired
591 reponse.
592 */
593 //@{ code
594 EST_FVector response(16);
595 response[0] = 1;
596 response[1] = 1;
597 response[2] = 1;
598 response[3] = 1;
599 response[4] = 0;
600 response[5] = 0;
601 response[6] = 0;
602 response[7] = 0;
603 response[8] = 1;
604 response[9] = 1;
605 response[10] = 1;
606 response[11] = 1;
607 response[12] = 0;
608 response[13] = 0;
609 response[14] = 0;
610 response[15] = 0;
611
612 filter = design_FIR_filter(response, 15);
613
614 FIRfilter(sig, response);
615 //@} code
616 /**The normal filtering functions can cause a time delay in the
617 filtered waveform. To attempt to eliminate this, a set of
618 double filter function functions are provided which guarantees
619 zero phase differences between the original and filtered waveform.
620 */
621 //@{ code
622 FIRlowpass_double_filter(sig, 400);
623 FIRhighpass_double_filter(sig, 40);
624 //@} code
625
626 /** Sometimes it is undesirable to have the input signal overwritten.
627 For these cases, a set of parallel functions exist which take
628 a input waveform for reading and a output waveform for writing to.
629 */
630 //@{ code
631 EST_Wave sig_out;
632
633 FIRfilter(sig, sig_out, response);
634 FIRlowpass_filter(sig, sig_out, 400);
635 FIRhighpass_filter(sig, sig_out, 40);
636 //@} code
637 //@}
638
639 }
640
641 //@}
642