1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* */
34 /* Author: Paul Taylor */
35 /* Date: 6 Jan 1998 */
36 /* --------------------------------------------------------------------- */
37 /* Acoustic Unit Concatenation */
38 /* */
39 /*************************************************************************/
40
41
42 #include "siod.h"
43 #include "EST_sigpr.h"
44 #include "EST_wave_aux.h"
45 #include "EST_track_aux.h"
46 #include "EST_ling_class.h"
47 #include "us_synthesis.h"
48 #include <cmath>
49
50 #include "Phone.h"
51
52 void merge_features(EST_Item *from, EST_Item *to, int keep_id);
53
54 void dp_time_align(EST_Utterance &utt, const EST_String &source_name,
55 const EST_String &target_name,
56 const EST_String &time_name,
57 bool do_start);
58
59 void concatenate_unit_coefs(EST_Relation &unit_stream, EST_Track &source_lpc);
60 void us_unit_raw_concat(EST_Utterance &utt);
61
62 void window_units(EST_Relation &unit_stream,
63 EST_TVector<EST_Wave> &frames,
64 float window_factor,
65 EST_String window_name,
66 bool window_symmetric,
67 EST_IVector *pm_indices=0);
68
69 bool dp_match(const EST_Relation &lexical,
70 const EST_Relation &surface,
71 EST_Relation &match,
72 float ins, float del, float sub);
73
74 void map_match_times(EST_Relation &target, const EST_String &match_name,
75 const EST_String &time_name, bool do_start);
76
77
window_frame(EST_Wave & frame,EST_Wave & whole,float scale,int start,int end,EST_WindowFunc * window_function,int centre_index=-1)78 static void window_frame(EST_Wave &frame, EST_Wave &whole, float scale,
79 int start, int end, EST_WindowFunc *window_function,
80 int centre_index=-1)
81 {
82 int i, j, send;
83 EST_TBuffer<float> window;
84 int window_length = (end-start)+1;
85
86 if (frame.num_samples() != (window_length))
87 frame.resize(window_length);
88 frame.set_sample_rate(whole.sample_rate());
89 // Ensure we have a safe end
90 if (end < whole.num_samples())
91 send = end;
92 else
93 send = whole.num_samples();
94
95
96 int print_centre;
97 if ( centre_index < 0 ){
98 window_function( window_length, window, -1 );
99 print_centre = (window_length-1)/2+start;
100 }
101 else{
102 window_function( window_length, window, (centre_index-start));
103 print_centre = centre_index;
104 }
105
106
107 #if defined(EST_DEBUGGING)
108 cerr << "(start centre end window_length wholewavelen) "
109 << start << " "
110 << print_centre << " "
111 << end << " "
112 << window_length << " "
113 << whole.num_samples() << endl;
114 #endif
115
116
117 // To allow a_no_check access we do this in three stages
118 for (i = 0, j = start; j < 0; ++i, ++j)
119 frame.a_no_check(i) = 0;
120 for ( ; j < send; ++i, ++j)
121 frame.a_no_check(i) = (int)((float)whole.a_no_check(j) * window(i) * scale);
122 for ( ; j < end; ++j,++i)
123 frame.a_no_check(i) = 0;
124
125
126 #if defined(EST_DEBUGGING)
127 // It's not always very nice to resynthesise speech from
128 // inserted zeros! These checks should alert the user (me ;)
129 if( start<0 )
130 EST_warning( "padded start of pitch period with zeros (index %d)", i );
131
132 if( end>whole.num_samples() )
133 EST_warning( "padded end of pitch period with zeros (frame %d)", i );
134 #endif
135 }
136
137
138 // The window_signal function has been changed in several ways:
139 //
140 // *) The function now has an asymmetric window mode.
141 //
142 // In this mode, asymmetric windows are used from pitchmark at t-1
143 // to pitchmark at time t+1, with the maximum value of 1.0 at
144 // pitchmark at time t.
145 //
146 // *) In the original symmetric mode:
147 //
148 // The first change is to ensure the window frames always have an
149 // odd number of samples (a convention for how to handle rounding
150 // problems when converting from times (float) to sample numbers
151 // (int)). The centre sample corresponds to the pitch mark time.
152 //
//   The second change is that the estimate of the local pitch period is
//   always based on the current and *previous* pitchmarks.  In the case
155 // of the first pitch mark in track pm, the previous pitchmark is
156 // assumed to be at zero time. Hopefully, this won't break much.
157 // However, if this convention is not used everywhere else that
158 // it's needed and some things break, then arguably those
159 // things need to be fixed to adhere to this same convention...
window_signal(EST_Wave & sig,EST_Track & pm,EST_WaveVector & frames,int & i,float scale,float window_factor,EST_WindowFunc * window_function,bool window_symmetric,EST_IVector * pm_indices=0)160 void window_signal(EST_Wave &sig, EST_Track &pm,
161 EST_WaveVector &frames, int &i, float scale,
162 float window_factor,
163 EST_WindowFunc *window_function,
164 bool window_symmetric,
165 EST_IVector *pm_indices=0)
166 {
167 float first_pos, period=0.0;
168 float prev_pm, current_pm;
169 int first_sample, centre_sample, last_sample;
170 int sample_rate = sig.sample_rate();
171 int pm_num_frames = pm.num_frames();
172
173 // estimate first period as pitchmark time itself (i.e. assume a previous
174 // pitchmark at 0.0 time, waveform sample 0)
175 prev_pm = 0.0;
176
177
178 if( window_symmetric )
179 {
180 if (pm_num_frames < 1 )
181 EST_error( "Attempted to Window around less than 1 pitchmark" );
182
183 for( int j=0; j<pm_num_frames; ++j, ++i ){
184 current_pm = pm.t(j);
185 period = current_pm - prev_pm;
186 centre_sample = (int)rint( current_pm*(float)sample_rate );
187
188 first_pos = prev_pm - (period * (window_factor-1.0));
189 first_sample = (int)rint( first_pos*(float)sample_rate );
190
191 last_sample = (2*centre_sample)-first_sample;
192
193 window_frame(frames[i], sig, scale, first_sample, last_sample, window_function);
194
195 prev_pm = current_pm;
196 }
197 }
198 else{
199 if( pm_indices == 0 )
200 EST_error( "required pitchmark indices EST_IVector is null" );
201
202 int j;
203
204 // Rob's experiment to see if we can handle small bits of speech with no pitchmarks.
205 // We just 0 the frames in this case.
206
207 if (pm_num_frames < 1 )
208 {
209 EST_warning( "Attempted to Window around less than 1 pitchmark" );
210 }
211 else
212 {
213 for( j=0; j<pm_num_frames-1; ++j, ++i ){
214 current_pm = pm.t(j);
215 period = current_pm - prev_pm;
216 centre_sample = (int)rint( current_pm*(float)sample_rate );
217
218 first_pos = prev_pm - (period * (window_factor-1.0));
219 first_sample = (int)rint( first_pos*(float)sample_rate );
220
221 float next_pm = pm.t(j+1);
222 float last_pos = next_pm + ((next_pm-current_pm)*(window_factor-1.0));
223 last_sample = (int)rint( last_pos*(float)sample_rate );
224
225 window_frame(frames[i], sig, scale, first_sample,
226 last_sample, window_function, centre_sample);
227 (*pm_indices)[i] = centre_sample - first_sample;
228
229 prev_pm = current_pm;
230 }
231
232 //last frame window size is set according to pm.t(end) and the number
233 //of samples in the waveform (it is presumed the waveform begins at the
234 //preceeding pitchmark and ends at the pitchmark following the current
235 //unit...)
236
237 current_pm = pm.t(j);
238 centre_sample = (int)rint( current_pm*(float)sample_rate );
239 first_pos = prev_pm - (period * (window_factor-1.0));
240 first_sample = (int)rint( first_pos*(float)sample_rate );
241 last_sample = sig.num_samples()-1;
242 window_frame(frames[i], sig, scale, first_sample,
243 last_sample, window_function);
244 (*pm_indices)[i] = centre_sample - first_sample;
245
246 #if defined(EST_DEBUGGING)
247 cerr << "changed: " << i << " " << pm_indices->n() << endl;
248 #endif
249
250 ++i;
251 }
252 }
253 }
254
window_units(EST_Relation & unit_stream,EST_TVector<EST_Wave> & frames,float window_factor,EST_String window_name,bool window_symmetric,EST_IVector * pm_indices)255 void window_units( EST_Relation &unit_stream,
256 EST_TVector<EST_Wave> &frames,
257 float window_factor,
258 EST_String window_name,
259 bool window_symmetric,
260 EST_IVector *pm_indices )
261 {
262 int i;
263 EST_Wave *sig;
264 EST_Item *u;
265 EST_Track *coefs;
266 int num = 0;
267 float scale;
268 EST_WindowFunc *window_function;
269
270 for (u = unit_stream.head(); u; u = u->next())
271 num += track(u->f("coefs"))->num_frames();
272 frames.resize(num);
273
274 if( pm_indices != 0 )
275 pm_indices->resize(num);
276
277 if (window_name == "")
278 window_name = "hanning";
279
280 window_function = EST_Window::creator(window_name);
281
282 for (i = 0, u = unit_stream.head(); u; u = u->next())
283 {
284 sig = wave(u->f("sig"));
285 coefs = track(u->f("coefs"));
286 scale = (u->f_present("scale") ? u->F("scale") : 1.0);
287
288 window_signal(*sig, *coefs, frames, i, scale, window_factor,
289 window_function, window_symmetric, pm_indices);
290 }
291 }
292
293
us_unit_concat(EST_Utterance & utt,float window_factor,const EST_String & window_name,bool no_waveform=false,bool window_symmetric=true)294 void us_unit_concat(EST_Utterance &utt, float window_factor,
295 const EST_String &window_name,
296 bool no_waveform=false,
297 bool window_symmetric=true)
298
299 {
300 EST_Relation *unit_stream;
301 EST_Track *source_coef = new EST_Track;
302 EST_WaveVector *frames = new EST_WaveVector;
303 EST_IVector *pm_indices = 0;
304
305 unit_stream = utt.relation("Unit", 1);
306
307 concatenate_unit_coefs(*unit_stream, *source_coef);
308
309 utt.create_relation("SourceCoef");
310 EST_Item *item = utt.relation("SourceCoef")->append();
311 item->set("name", "coef");
312 item->set_val("coefs", est_val(source_coef));
313
314 if (!no_waveform){
315 if( !window_symmetric )
316 pm_indices = new EST_IVector;
317
318 window_units(*unit_stream, *frames,
319 window_factor, window_name, window_symmetric, pm_indices);
320
321 item->set_val("frame", est_val(frames));
322
323 if( !window_symmetric )
324 item->set_val("pm_indices", est_val(pm_indices));
325 }
326 }
327
328
us_get_copy_wave(EST_Utterance & utt,EST_Wave & source_sig,EST_Track & source_coefs,EST_Relation & source_seg)329 void us_get_copy_wave(EST_Utterance &utt, EST_Wave &source_sig,
330 EST_Track &source_coefs, EST_Relation &source_seg)
331 {
332 EST_Item *s, *n;
333
334 if (!utt.relation_present("Segment"))
335 EST_error("utterance must have \"Segment\" relation\n");
336
337 utt.create_relation("TmpSegment");
338
339 for (s = source_seg.head(); s; s = s->next())
340 {
341 n = utt.relation("TmpSegment")->append();
342 merge_features(n, s, 0);
343 }
344
345 utt.relation("Segment")->remove_item_feature("source_end");
346
347 dp_time_align(utt, "TmpSegment", "Segment", "source_", 0);
348
349 utt.create_relation("Unit");
350 EST_Item *d = utt.relation("Unit")->append();
351
352
353 EST_Wave *ss = new EST_Wave;
354 *ss = source_sig;
355
356 EST_Track *c = new EST_Track;
357 *c = source_coefs;
358
359 d->set_val("sig", est_val(ss));
360 d->set_val("coefs", est_val(c));
361
362 utt.remove_relation("TmpSegment");
363 }
364
365
us_energy_normalise(EST_Relation & unit)366 void us_energy_normalise(EST_Relation &unit)
367 {
368 EST_Wave *sig;
369
370 for (EST_Item *s = unit.head(); s; s = s->next())
371 {
372 sig = wave(s->f("sig"));
373 if (s->f_present("energy_factor"))
374 sig->rescale(s->F("energy_factor"));
375 }
376 }
377
us_unit_raw_concat(EST_Utterance & utt)378 void us_unit_raw_concat(EST_Utterance &utt)
379 {
380 EST_Wave *sig, *unit_sig;
381 EST_Track *unit_coefs=0;
382 float window_factor;
383 int i, j, k;
384 int first_pm, last_pm, last_length;
385 float first_pos, last_pos;
386
387 window_factor = get_c_float(siod_get_lval("window_factor",
388 "UniSyn: no window_factor"));
389 sig = new EST_Wave;
390
391 sig->resize(1000000);
392 sig->fill(0);
393 j = 0;
394
395 for (EST_Item *s = utt.relation("Unit", 1)->head(); s; s = s->next())
396 {
397 unit_sig = wave(s->f("sig"));
398 unit_coefs = track(s->f("coefs"));
399
400 first_pos = unit_coefs->t(1);
401 first_pm = (int)(first_pos * (float)unit_sig->sample_rate());
402
403 last_pos = unit_coefs->t(unit_coefs->num_frames()-2);
404 last_pm = (int)(last_pos * (float)unit_sig->sample_rate());
405 last_length = unit_sig->num_samples() - last_pm;
406
407 // cout << "first pm: " << first_pm << endl;
408 // cout << "last pm: " << last_pm << endl;
409 // cout << "last length: " << last_length << endl;
410
411 j -= first_pm;
412
413 for (i = 0; i < first_pm; ++i, ++j)
414 sig->a_safe(j) += (short)((((float) i)/ (float)first_pm) *(float)unit_sig->a_safe(i)+0.5);
415
416 for (; i < last_pm; ++i, ++j)
417 sig->a(j) = unit_sig->a(i);
418
419 for (k = 0; i < unit_sig->num_samples(); ++i, ++j, ++k)
420 sig->a_safe(j) += (short)((1.0 - (((float) k) / (float) last_length))
421 * (float)unit_sig->a_safe(i) + 0.5);
422
423 // j -= last_length;
424 // j += 2000;
425 }
426
427 sig->resize(j);
428 sig->set_sample_rate(16000);
429
430 add_wave_to_utterance(utt, *sig, "Wave");
431 }
432
433
concatenate_unit_coefs(EST_Relation & unit_stream,EST_Track & source_lpc)434 void concatenate_unit_coefs(EST_Relation &unit_stream, EST_Track &source_lpc)
435 {
436 int num_source_frames = 0;
437 int num_source_channels = 0;;
438 float prev_time, abs_offset, rel_offset, period, offset;
439 int i, j, k, l;
440 EST_Track *coefs;
441
442 EST_Item *u = unit_stream.head();
443 if( u == 0 ){
444 //sometimes we are just asked to synthesise empty utterances, and
445 //code elsewhere wants us to continue...
446 source_lpc.resize(0,0);
447 }
448 else{
449 EST_Track *t = 0;
450 for ( ; u; u = u->next())
451 {
452 t = track(u->f("coefs"));
453 num_source_frames += t->num_frames();
454 }
455
456 num_source_channels = t->num_channels();
457
458 source_lpc.resize(num_source_frames, num_source_channels);
459 source_lpc.copy_setup(*t);
460
461 prev_time = 0.0;
462 // copy basic information
463 for (i = 0, l = 0, u = unit_stream.head(); u; u = u->next())
464 {
465 coefs = track(u->f("coefs"));
466
467 for (j = 0; j < coefs->num_frames(); ++j, ++i)
468 {
469 for (k = 0; k < coefs->num_channels(); ++k)
470 source_lpc.a_no_check(i, k) = coefs->a_no_check(j, k);
471 source_lpc.t(i) = coefs->t(j) + prev_time;
472 }
473
474 prev_time = source_lpc.t(i - 1);
475 u->set("end", prev_time);
476 u->set("num_frames", coefs->num_frames());
477 }
478 }
479
480 // adjust pitchmarks
481 abs_offset = 0.0;
482 rel_offset = 0.0;
483 // absolute offset in seconds
484 abs_offset = get_c_float(siod_get_lval("us_abs_offset", "zz"));
485 // relative offset as a function of local pitch period
486 rel_offset = get_c_float(siod_get_lval("us_rel_offset", "zz"));
487
488 if( abs_offset!=0.0 || rel_offset!=0.0 ){
489 cerr << "Adjusting pitchmarks" << endl;
490 for (i = 0; i < source_lpc.num_frames(); ++i){
491 period = get_time_frame_size(source_lpc, (i));
492 offset = abs_offset + (rel_offset * period);
493 source_lpc.t(i) = source_lpc.t(i) + offset;
494 }
495 }
496 }
497
498 // jointimes specifies centre of last pitch period in each
499 // concatenated unit
500 // void us_linear_smooth_amplitude( EST_Wave *w,
501 // const EST_Track &pm,
502 // const EST_FVector &jointimes)
503 // {
504 // int num_joins = jointimes.length();
505
506 // EST_Track *factor_contour = new EST_Track( num_joins );
507
508 // for( int i=0; i<num_joins; ++i ){
509 // float join_t = jointimes(i);
510 // int join_indx = pm.index_below( join_t );
511
512 // // estimate local short-time energy function either side of join
513 // int left_start = rount(pm.t(join_indx-2)*(float)16000);
514 // int left_end = rount(pm.t(join_indx)*(float)16000);
515 // float left_power = 0.0 ;
516 // for( int j=left_start; j<left_end; ++j )
517 // left_power += pow( w[j], 2 );
518
519 // left_power /= (left_end - left_start); //normalise for frame length
520
521 // int right_start = rount(pm.t(join_indx+1)*(float)16000);
522 // int right_end = rount(pm.t(join_indx+3)*(float)16000);
523 // float right_power = 0.0;
524 // for( int j=right_start; j<right_end; ++j )
525 // right_power += pow( w[j], 2 );
526
527 // right_power /= (right_end - right_start); //normalise for frame length
528
529 // float mean_power = (left_power+right_power)/2.0;
530
531 // float left_factor = left_power/mean_power;
532 // float right_factor = right_power/mean_power;
533
534 // (*factor_contour)[i] = left_factor;
535 // (*factor_contour)[i+1] = right_factor;
536 // }
537
538 // }
539
// Compute the RMS amplitude of every pitch-period frame.
//
//   pp - the windowed frames.
//   pm - track supplying the time of each frame (pm.t(i) is read for every
//        frame index i of pp).
//
// Returns a newly allocated single-channel track with one frame per entry of
// pp: channel 0 holds the root-mean-square of the frame's samples and the
// frame time is copied from pm.  An empty frame gives 0 rather than the NaN
// that 0/0 would produce.  The caller is responsible for deleting the result.
static EST_Track* us_pitch_period_energy_contour( const EST_WaveVector &pp,
                                                  const EST_Track &pm )
{
    const int pp_length = pp.length();

    EST_Track *contour = new EST_Track;
    contour->resize( pp_length, 1 );

    for( int i=0; i<pp_length; ++i ){
        const EST_Wave &frame = pp(i);
        const int frame_length = frame.length();

        // sum of squared samples
        float sum_sq = 0.0;
        for( int j=0; j<frame_length; ++j ){
            const float sample = float(frame.a_no_check( j ));
            sum_sq += sample * sample;
        }

        if( frame_length > 0 )
            contour->a_no_check(i,0) = sqrt( sum_sq / (float)frame_length );
        else
            contour->a_no_check(i,0) = 0.0;

        contour->t(i) = pm.t(i);
    }

    return contour;
}
563
564 EST_Val ffeature(EST_Item *item,const EST_String &fname);
565
us_linear_smooth_amplitude(EST_Utterance * utt)566 void us_linear_smooth_amplitude( EST_Utterance *utt )
567 {
568 EST_WaveVector *pp = wavevector(utt->relation("SourceCoef")->first()->f("frame"));
569 EST_Track *pm = track(utt->relation("SourceCoef")->first()->f("coefs"));
570
571 EST_Track *energy = us_pitch_period_energy_contour( *pp, *pm );
572 energy->save( "./energy_track.est", "est" );
573
574 FILE *ofile = fopen( "./join_times.est", "w" );
575 EST_Relation *units = utt->relation("Unit");
576 for( EST_Item *u=units->head(); u; u=u->next() ){
577
578 EST_Item *diphone_left = u;
579 // EST_Item *diphone_right = u->next();
580
581 fprintf( ofile, "%s\t%f\n", diphone_left->S("name").str(), diphone_left->F("end"));
582
583 EST_Item *join_phone_left = item(diphone_left->f("ph1"))->next();
584 EST_String phone_name = join_phone_left->S("name");
585 if( ph_is_sonorant( phone_name ) && !ph_is_silence( phone_name )){
586
587 //if( (ffeature(join_phone_left, "ph_vc")).S() == "+"){ // ideally for sonorants
588
589 cerr << "smoothing phone " << join_phone_left->S("name") << "\n";
590
591 // EST_Item *join_phone_right = item(diphone_right->f("ph1"));
592
593 int left_end_index = energy->index(diphone_left->F("end"));
594 int right_start_index = left_end_index + 1;
595 float left_power = energy->a(left_end_index,0);
596 float right_power = energy->a(right_start_index,0);
597
598 float mean_power = (left_power+right_power)/2.0;
599 float left_factor = left_power/mean_power;
600 float right_factor = right_power/mean_power;
601
602 int smooth_start_index = left_end_index-5;
603 int smooth_end_index = right_start_index+5;
604
605
606 // rescale left pitch periods
607 float factor = 1.0;
608 float factor_incr = (left_factor-1.0)/(float)(left_end_index - smooth_start_index);
609 for( int i=smooth_start_index; i<=left_end_index; ++i, factor+=factor_incr ){
610 (*pp)[i].rescale( factor, 0 );
611 cerr << "rescaled frame " << i << "(factor " << factor << ")\n";
612 }
613
614 // rescale right pitch periods
615 factor = right_factor;
616 factor_incr = (1.0-right_factor)/(float)(smooth_end_index-right_start_index);
617 for( int i=right_start_index; i<=smooth_end_index; ++i, factor+=factor_incr){
618 (*pp)[i].rescale( factor, 0 );
619 cerr << "rescaled frame " << i << "(factor " << factor << ")\n";
620 }
621 }
622 else
623 cerr << "no smoothing for " << join_phone_left->S("name") << "\n";
624
625 cerr <<endl;
626 }
627
628 fclose( ofile );
629 delete energy;
630 }
631
632