1 /************************************************************************/
2 /*                                                                      */
3 /*                Centre for Speech Technology Research                 */
4 /*                     University of Edinburgh, UK                      */
5 /*                       Copyright (c) 1996,1997                        */
6 /*                        All Rights Reserved.                          */
7 /*                                                                      */
8 /*  Permission is hereby granted, free of charge, to use and distribute */
9 /*  this software and its documentation without restriction, including  */
10 /*  without limitation the rights to use, copy, modify, merge, publish, */
11 /*  distribute, sublicense, and/or sell copies of this work, and to     */
12 /*  permit persons to whom this work is furnished to do so, subject to  */
13 /*  the following conditions:                                           */
14 /*   1. The code must retain the above copyright notice, this list of   */
15 /*      conditions and the following disclaimer.                        */
16 /*   2. Any modifications must be clearly marked as such.               */
17 /*   3. Original authors' names are not deleted.                        */
18 /*   4. The authors' names are not used to endorse or promote products  */
19 /*      derived from this software without specific prior written       */
20 /*      permission.                                                     */
21 /*                                                                      */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK       */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING     */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT  */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE    */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,         */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF      */
30 /*  THIS SOFTWARE.                                                      */
31 /*                                                                      */
32 /************************************************************************/
33 /*                                                                      */
34 /*                 Author: Paul Taylor (pault@cstr.ed.ac.uk)            */
35 /*                   Date: Fri May  9 1997                              */
36 /* -------------------------------------------------------------------  */
37 /* Examples of Generation of Acoustic Feature Vectors from Waveforms    */
38 /*                                                                      */
39 /************************************************************************/
40 
41 #include <cstdlib>
42 #include "EST_sigpr.h"
43 #include "EST_cmd_line.h"
44 #include "EST_inline_utils.h"
45 #include "EST_sigpr.h"
46 
47 /**@name Signal processing example code
48   *
49   * @toc
50   */
51 //@{
52 
53 EST_StrList empty;
54 
55 void print_map(EST_TrackMap &t);
56 void print_track_map(EST_Track &t);
57 
main(void)58 int main(void)
59 
60 {
61     EST_StrList base_list; // decl
62     EST_StrList delta_list; // decl
63     EST_StrList acc_list; // decl
64     EST_Option op, al; // decl
65     init_lib_ops(al, op);
66     EST_Wave sig; // decl
67     EST_Track fv, part; // decl
68     float shift; // decl
69     int i;
70 
71 
72     cout << "position 1\n";
73 
74     /**@name Producing a single type of feature vector for an utterance
75 
76        A number of types of signal processing can be performed by the
77        sig2coef function. The following code demonstrates a simple
78        case of calculating the linear prediction (LP) coefficients for
79        a waveform.
80 
81        First set the order of the lpc analysis to 16 (this entails 17 actual
82        coefficients) and then load in the waveform to be analysed.
83 
84       */
85     //@{
86 
87 
88     //@{ code
89 
90     int lpc_order = 16;
91     sig.load(DATA "/kdt_001.wav");
92 
93     //@} code
94 
95     /**  Now allocate enough space in the track to hold the analysis.
96 	 The following command resizes fv to have enough frames for
97 	 analysis frames at 0.01 intervals up to the end of the waveform,
98 	 (sig.end()), and enough channels to store lpc_order + 1 coefficients.
99 	 The channels are named so as to take lpc coefficients.
100     */
101     //@{ code
102     int num_frames;
103     num_frames = (int)ceil(sig.end() / 0.01);
104     fv.resize(num_frames, lpc_order + 1);
105     //@} code
106 
107     /** The positions of the frames, corresponding to the middel of their
108 	analysis window also needs to be set. For fixed frame analysis, this
109 	can be done with the fill_time() function: */
110 
111     //@{ code
112     fv.fill_time(0.01);
113     //@} code
114 
115     /** The simplest way to do the actual analysis is as follows, which
116 	will fill the track with the values from the LP analysis using the
117 	default processing controls.
118     */
119 
120     //@{ code
121     sig2coef(sig, fv, "lpc");
122     //@} code
123 
124     /** In this style of analysis, default values are used to control the
125 	windowing mechanisms which split the whole signal into frames.
126 
127 	Specifically, each frame is defined to start a certain distance
128 	before the time interval, and extending the same distance after.
129 	This distance is calculated as a function of the local window
130 	spacing and can be adjusted as follows:
131 
132 	Extending one time period before and one time period after the
133 	current time mark:
134     */
135     //@{ code
136     sig2coef(sig, fv, "lpc", 2.0);
137     //@} code
138     /** Extending 1.5 time periods before and  after the
139     current time mark, etc;
140     */
141     //@{ code
142     sig2coef(sig, fv, "lpc", 3.0);
143     //@} code
144 
145     /** The type of windowing function may be changed also as this
146 	can be passed in as an optional argument. First we
147 	create a window function (This is explained more in \Ref{Windowing}).
148     */
149     //@{ code
150     EST_WindowFunc *wf =  EST_Window::creator("hamming");
151     //@} code
152     /** and then pass it in as the last argument
153      */
154     //@{ code
155     sig2coef(sig, fv, "lpc", 3.0, wf);
156     //@} code
157     //@}
158 
159     /**@name Pitch-Synchronous vs fixed frame analysis.
160 
161       Most of the core signal processing functions operate on individual
162       frames of speech and are oblivious as to how these frames were
163       extracted from the original speech. This allows us to take the frames
164       from anywhere in the signal: specifically, this facilitates two
165       common forms of analysis:
166 
167       <formalpara><title>fixed frame</title><para>
168       The time points are space at even intervals throughout the signal.
169       </para></formalpara>
170       <formalpara><title>pitch-synchronous</title><para>
171       The time points represent <emphasis>pitchmarks</emphasis>
172       and correspond to a specific position in each pitch period,
173       e.g. the instant of glottal closure.</para></formalpara>
174       <para>
175 
176       It is a simple matter to fill the time array, but normally
177       pitchmarks are read from a file or taken from another signal
178       processing algorithm (see \Ref{Pitchmark functions.}).
179       </para>
180       <para>
181 
182 
183       There are many ways to fill the time array for fixed frame analysis.
184 
185       manually:
186 
187       */
188     //@{
189 
190     //@{ code
191     int num_frames = 300;
192     fv.resize(num_frames, lpc_order + 1);
193     shift = 0.01; // time interval in seconds
194 
195     for (i = 0; i < num_frames; ++i)
196 	fv.t(i) = shift * (float) i;
197     //@} code
198     /** or by use of the  member function \Ref{EST_Track::fill_time}
199      */
200 
201     //@{ code
202     fv.fill_time(0.01);
203     //@} code
204 
205     /** Pitch synchronous values can simply be read from pitchmark
206      files:
207     */
208     //@{ code
209     fv.load(DATA "/kdt_001.pm");
210     make_track(fv, "lpc", lpc_order + 1);
211     //@} code
212 
213     /** Regardless of how the time points where obtain, the analysis
214      function call is just the same:
215     */
216     //@{ code
217     sig2coef(sig, fv, "lpc");
218     //@} code
219     //@}
220 
221     cout << "position 3\n";
222 
223     /**@name Naming Channels
224       @id sigpr-example-naming-channels
225       Multiple types of feature vector can be stored in the same Track.
226       Imagine that we want lpc, cepstrum and power
227       coefficients in that order in a track. This can be achieved by using
228       the \Ref{sig2coef} function multiple times, or by the wrap
229       around \Ref{sigpr_base} function.
230       </para><para>
231 
232       It is vitally important here to ensure that before passing the
233       track to the signal processing functions that it has the correct
234       number of channels and that these are appropriately named. This is
235       most easily done using the track map facility, explained
236       in <LINK LINKEND="proan129">Naming Channels</LINK>
237       </para><para>
238 
239       For each call, we only us the part of track that is relevant.
240       The sub_track member function of \Ref{EST_Track} is used to get
241       this. In the following example, we are assuming here that
242       fv has sufficient space for 17
243       lpc coefficients, 8 cepstrum  coefficients and power and that
244       they are stored in that order.
245 
246       */
247     //@{
248     //@{ code
249 
250     int cep_order = 16;
251     EST_StrList map;
252 
253     map.append("$lpc-0+" Stringtoi(lpc_order));
254     map.append("$cepc-0+" Stringtoi(cep_order));
255     map.append("power");
256 
257     fv.resize(EST_CURRENT, map);
258     //@} code
259 
260     /** An alternative is to use <function>add_channels_to_map()</function>
261 	which takes a list of coefficient types and makes a map.
262 	The order of each type of processing is extracted from
263 	op.
264 	*/
265 
266     //@{ code
267 
268     EST_StrList coef_types;
269 
270     coef_types.append("lpc");
271     coef_types.append("cep");
272     coef_types.append("power");
273 
274     map.clear();
275 
276     add_channels_to_map(map, coef_types, op);
277     fv.resize(EST_CURRENT, map);
278 
279     //@} code
280 
281     /** After allocating the right number of frames and channels
282 	 in {\tt fv}, we extract a sub_track, which has all the frames
283 	 (i.e. between 0 and EST_ALL) and all the lpc channels
284     */
285     //@{ code
286     fv.sub_track(part, 0, EST_ALL, 0, "lpc_0", "lpc_N");
287     //@} code
288     /** now call the signal processing function on this part:
289      */
290     //@{ code
291     sig2coef(sig, part, "lpc");
292     //@} code
293 
294     /** We repeat the procedure for the cepstral coefficients, but this
295 	time take the next 8 channels (17-24 inclusive)  and calculate the coefficients:
296     */
297     //@{ code
298     fv.sub_track(part, 0, EST_ALL, "cep_0", "cep_N");
299 
300     sig2coef(sig, part, "cep");
301     //@} code
302     /** Extract the last channel for power and call the power function:
303      */
304     //@{ code
305     fv.sub_track(part, 0, EST_ALL, "power", 1);
306     power(sig, part, 0.01);
307 
308     //@} code
309 
310      /** While the above technique is adequate for our needs and is
311 	a useful demonstration of sub_track extraction, the
312 	\Ref{sigpr_base} function is normally easier to use as it does
313 	all the sub track extraction itself. To perform the lpc, cepstrum
314 	and power analysis, we put these names into a StrList and
315 	call \Ref{sigpr_base}.
316      */
317     //@{ code
318     base_list.clear(); // empty the list, just in case
319     base_list.append("lpc");
320     base_list.append("cep");
321     base_list.append("power");
322 
323     sigpr_base(sig, fv, op, base_list);
324     //@} code
325     /** This will call \Ref{sigpr_track} as many times as is necessary.
326      */
327     //@}
328 
329     /**@name Producing delta and acceleration coefficients
330 
331       Delta coefficients represent the numerical differentiation of a
332       track, and acceleration coefficients represent the second
333       order numerical differentiation.
334 
335       By convention, delta coefficients have a "_d" suffix and acceleration
336       coefficients "_a". If the coefficient is multi-dimensional, the
337       numbers go after the "_d" or "_a".
338 
339       */
340     //@{
341     //@{ code
342 
343     map.append("$cep_d-0+" Stringtoi(cep_order)); // add deltas
344     map.append("$cep_a-0+" Stringtoi(cep_order)); // add accs
345 
346     fv.resize(EST_CURRENT, map); // resize the track.
347     //@} code
348     /**
349 	Given a EST_Track of coefficients {\tt fv}, the \Ref{delta}
350 	function is used to produce the delta equivalents {\tt
351 	del}. The following uses the track allocated above and
352 	generates a set of cepstral coefficients and then makes their
353 	delta and acc:
354 
355     */
356     //@{ code
357 
358     EST_Track del, acc;
359 
360     fv.sub_track(part, 0, EST_ALL, 0, "cep_0", "cep_N"); // make subtrack of coefs
361     sig2coef(sig, part, "cep");  // fill with cepstra
362 
363     // make subtrack of deltas
364     fv.sub_track(del, 0, EST_ALL, 0, "cep_d_0", "cep_d_N");
365     delta(part, del);  // calculate deltas of part, and place answer in del
366 
367     // make subtrack of accs
368     fv.sub_track(acc, 0, EST_ALL, 0, "cep_a_0", "cep_a_N");
369     delta(del, acc);  // calculate deltas of del, and place answer in acc
370     //@} code
371     /** It is possible to directly calculate the delta coefficients of
372 	a type of coefficient, even if we don't have the base type.
373 	\Ref{sigpr_delta} will process the waveform, make a temporary
374 	track of the required type "lpc" and calculate the delta of this.
375 	</para><para>
376 	The following makes a set of delta reflection coefficients:
377 
378     */
379     //@{ code
380     map.append("$ref_d-0+" Stringtoi(lpc_order)); // add to map
381     fv.resize(EST_CURRENT, map); // resize the track.
382 
383     sigpr_delta(sig, fv, op, "ref");
384     //@} code
385     /** an equivalent function exists for acceleration coefficients:
386      */
387     //@{ code
388     map.append("$lsf_a-0+" Stringtoi(lpc_order)); // add acc lsf
389     fv.resize(EST_CURRENT, map); // resize the track.
390 
391     sigpr_acc(sig, fv, op, "ref");
392 
393     //@} code
394     //@}
395 
396     /**@name Windowing
397 
398       The \Ref{EST_Window} class provides a variety of means to
399       divide speech into frames using windowing mechanisms.
400 
401       </para><para>
402       A window function can be created from a window name using the
403       \Ref{EST_Window::creator} function:
404      */
405     //@{
406     //@{ code
407 
408     EST_WindowFunc *hamm =  EST_Window::creator("hamming");
409     EST_WindowFunc *rect =  EST_Window::creator("rectangular");
410     //@} code
411     /** This function can then be used to create a EST_TBuffer of
412 	window values. In the following example the values from a
413 	256 point hamming window are stored in the buffer win_vals:
414     */
415     //@{ code
416     EST_FVector frame;
417     EST_FVector win_vals;
418 
419     hamm(256, win_vals);
420     //@} code
421 
422     /** The make_window function also creates a window:
423      */
424     //@{ code
425     EST_Window::make_window(win_vals, 256, "hamming",-1);
426     //@} code
427 
428     /** this can then be used to make a frame of speech from the main EST_Wave
429 	sig. The following example extracts speech starting at sample 1000:
430     */
431     //@} code
432     for (i = 0; i < 256; ++i)
433 	frame[i] = (float)sig.a(i + 1000) * win_vals[i];
434     //@} code
435 
436     /** Alternatively, exactly the same operation can be performed in a
437 	single step by passing the window function to the
438 	\Ref{EST_Window::window_signal} function which takes a
439 	\Ref{EST_Wave} and performs windowing on a section of it,
440 	storing the output in the \Ref{EST_FVector} {\tt frame}.
441     */
442     //@{ code
443     EST_Window::window_signal(sig, hamm, 1000, 256, frame, 1);
444     //@} code
445     /** The window function need not be explicitly created, the window
446 	signal can work on just the name of the window type:
447     */
448 
449     //@{ code
450     EST_Window::window_signal(sig, "hamming", 1000, 256, frame, 1);
451     //@} code
452 
453     //@}
454     /**@name Frame based signal processing
455       @id sigpr-example-frames
456       The signal processing library provides an extensive set of functions
457       which operate on a single frame of coefficients.
458       The following example shows one method of splitting the signal
459       into frames and calling a signal processing algorithm.
460 
461       First set up the track for 16 order LP analysis:
462 
463       */
464     //@{
465     //@{ code
466 
467     map.clear();
468     map.append("$lpc-0+16");
469 
470     fv.resize(EST_CURRENT, map);
471 
472     //@} code
473     /** In this example, we take the analysis frame length to be 256 samples
474 	long, and the shift in samples is just the shift in seconds times the
475 	sampling frequency.
476     */
477     //@{ code
478     int s_length = 256;
479     int s_shift =  int(shift * float(sig.sample_rate()));
480     EST_FVector coefs;
481     //@} code
482 
483     /** Now we set up a loop which calculates the frames one at a time.
484 	 {\tt start} is the start position in samples of each frame.
485 	 The \Ref{EST_Window::window_signal} function is called which
486 	 makes a \Ref{EST_FVector} frame of the speech via a hamming window.
487 
488 	 Using the \Ref{EST_Track::frame} function, the EST_FVector
489 	 {\tt coefs} is set to frame {\tt k} in the track. It is important
490 	 to understand that this operation involves setting an internal
491 	 smart pointer in {\tt coefs} to the memory of frame {\tt k}. This
492 	 allows the signal processing function \Ref{sig2lpc} to operate
493 	 on an input and output \Ref{EST_FVector}, without any copying to or
494 	 from the main track. After the \Ref{sig2lpc} call, the kth frame
495 	 of {\tt fv} is now filled with the LP coefficients.
496     */
497     //@{ code
498     for (int k1 = 0; k1 < fv.num_frames(); ++k1)
499     {
500 	int start = (k1 * s_shift) - (s_length/2);
501 	EST_Window::window_signal(sig, "hamming", start, s_length, frame, 1);
502 
503 	fv.frame(coefs, k1); 	// Extract a single frame
504 	sig2lpc(frame, coefs); 	// Pass this to actual algorithm
505     }
506     //@} code
507 
508     /** A slightly different tack can be taken for pitch-synchronous analysis.
509 	Setting up fv with the pitchmarks and channels:
510     */
511     //@{ code
512     fv.load(DATA "/kd1_001.pm");
513     fv.resize(EST_CURRENT, map);
514     //@} code
515     /** Set up as before, but this time calculate the window starts and
516 	lengths from the time points. In this example, the length is a
517 	{\tt factor} (twice) the local frame shift.
518 	Note that the only difference between this function and the fixed
519 	frame one is in the calculation of the start and end points - the
520 
521 	windowing, frame extraction and call to \Ref{sig2lpc} are exactly
522 	the same.
523     */
524     //@{ code
525     float factor = 2.0;
526 
527     for (int k2 = 0; k2 < fv.num_frames(); ++k2)
528     {
529 	s_length = irint(get_frame_size(fv, k2, sig.sample_rate())* factor);
530 	int start = (irint(fv.t(k2) * sig.sample_rate()) - (s_length/2));
531 
532 	EST_Window::window_signal(sig, wf, start, s_length, frame, 1);
533 
534 	fv.frame(coefs, k2);
535 	sig2lpc(frame, coefs);
536     }
537     //@} code
538     //@}
539 
540     /**@name Filtering
541 
542        In the EST library we so far have two main types of filter,
543        {\bf finite impulse response (FIR)} filters and {\bf linear
544        prediction (LP)} filters. {\bf infinite impulse response (IIR)}
545        filters are not yet implemented, though LP filters are a
546        special case of these.
547        </para><para>
548        Filtering involves 2 stages: the design of the filter and the
549        use of this filter on the waveform.
550        </para><para>
551        First we examine a simple low-pass filter which attempts to suppress
552        all frequencies about a cut-off. Imagine we want to low pass filter
553        a signal at 400Hz. First we design the filter:
554     */
555     //@{
556     //@{ code
557 
558     EST_FVector filter;
559     int freq = 400;
560     int filter_order = 99;
561 
562     filter = design_lowpass_FIR_filter(sig.sample_rate(), 400, 99);
563     //@} code
564     /** And now use this filter on the signal:
565      */
566     //@{ code
567     FIRfilter(sig, filter);
568     //@} code
569     /** For one-off filtering operations, the filter design can be
570 	done in the filter function itself. The \Ref{FIRlowpass_filter}
571 	function takes the signal, cut-off frequency and order as
572 	arguments and designs the filter on the fly. Because of the
573 	overhead of filter design, this function is expensive and
574 	should only be used for one-off operations.
575     */
576     //@{ code
577     FIRlowpass_filter(sig, 400, 99);
578     //@} code
579     /** The equivalent operations exist for high-pass filtering:
580      */
581     //@{ code
582     filter = design_highpass_FIR_filter(sig.sample_rate(), 50, 99);
583     FIRfilter(sig, filter);
584     FIRhighpass_filter(sig, 50, 99);
585     //@} code
586     /** Filters of arbitrary frequency response can also be designed using
587 	the \Ref{design_FIR_filter} function. This function takes a
588 	EST_FVector of order $2^{N}$ which specifies the desired frequency
589 	response up to 1/2 the sampling frequency. The function returns
590 	a set of filter coefficients that attempt to match the desired
591 	reponse.
592     */
593     //@{ code
594     EST_FVector response(16);
595     response[0] = 1;
596     response[1] = 1;
597     response[2] = 1;
598     response[3] = 1;
599     response[4] = 0;
600     response[5] = 0;
601     response[6] = 0;
602     response[7] = 0;
603     response[8] = 1;
604     response[9] = 1;
605     response[10] = 1;
606     response[11] = 1;
607     response[12] = 0;
608     response[13] = 0;
609     response[14] = 0;
610     response[15] = 0;
611 
612     filter = design_FIR_filter(response, 15);
613 
614     FIRfilter(sig, response);
615     //@} code
616     /**The normal filtering functions can cause a time delay in the
617        filtered waveform. To attempt to eliminate this, a set of
618        double filter function functions are provided which guarantees
619        zero phase differences between the original and filtered waveform.
620     */
621     //@{ code
622     FIRlowpass_double_filter(sig, 400);
623     FIRhighpass_double_filter(sig, 40);
624     //@} code
625 
626     /** Sometimes it is undesirable to have the input signal overwritten.
627 	For these cases, a set of parallel functions exist which take
628 	a input waveform for reading and a output waveform for writing to.
629     */
630     //@{ code
631     EST_Wave sig_out;
632 
633     FIRfilter(sig, sig_out, response);
634     FIRlowpass_filter(sig, sig_out, 400);
635     FIRhighpass_filter(sig, sig_out, 40);
636     //@} code
637     //@}
638 
639 }
640 
641 //@}
642