1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* (University of Edinburgh, UK) and */
5 /* Korin Richmond */
6 /* Copyright (c) 2002 */
7 /* All Rights Reserved. */
8 /* */
9 /* Permission is hereby granted, free of charge, to use and distribute */
10 /* this software and its documentation without restriction, including */
11 /* without limitation the rights to use, copy, modify, merge, publish, */
12 /* distribute, sublicense, and/or sell copies of this work, and to */
13 /* permit persons to whom this work is furnished to do so, subject to */
14 /* the following conditions: */
15 /* */
16 /* 1. The code must retain the above copyright notice, this list of */
17 /* conditions and the following disclaimer. */
18 /* 2. Any modifications must be clearly marked as such. */
19 /* 3. Original authors' names are not deleted. */
20 /* 4. The authors' names are not used to endorse or promote products */
21 /* derived from this software without specific prior written */
22 /* permission. */
23 /* */
24 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
25 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
26 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT */
27 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
28 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
29 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
30 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
31 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
32 /* THIS SOFTWARE. */
33 /* */
34 /*************************************************************************/
35 /* */
36 /* Author: Korin Richmond */
37 /* Date: Aug 2002 */
38 /* --------------------------------------------------------------------- */
39 /* first stab at a diphone unit selection "voice" - using a list of */
40 /* utterance objects */
41 /*************************************************************************/
42
43 #include "festival.h"
44 #include "DiphoneUnitVoice.h"
45 #include "DiphoneVoiceModule.h"
46 #include "EST_DiphoneCoverage.h"
47 #include "EST_rw_status.h"
48 #include "EST_viterbi.h"
49 #include "EST_Track.h"
50 #include "EST_track_aux.h"
51 #include "EST_Wave.h"
52 #include "EST_THash.h"
53 #include "EST_TList.h"
54 #include "EST_types.h"
55 #include "ling_class/EST_Utterance.h"
56 #include "siod.h"
57 #include "siod_est.h"
58 #include "safety.h"
59 #include <cstdlib>
60
61 #include "EST_TargetCost.h"
62 #include "TargetCostRescoring.h"
63 #include "EST_JoinCost.h"
64 #include "EST_JoinCostCache.h"
65
66 #include "EST_Val.h"
67
68 SIOD_REGISTER_TYPE(itemlist,ItemList)
69 VAL_REGISTER_TYPE(itemlist,ItemList)
70
71 // from src/modules/UniSyn_diphone/us_diphone.h
72 // this won't be staying here long...
73 void parse_diphone_times(EST_Relation &diphone_stream,
74 EST_Relation &source_lab);
75
SIOD_REGISTER_CLASS(du_voice,DiphoneUnitVoice)76 SIOD_REGISTER_CLASS(du_voice,DiphoneUnitVoice)
77 VAL_REGISTER_CLASS(du_voice,DiphoneUnitVoice)
78
79 static void my_parse_diphone_times(EST_Relation &diphone_stream,
80 EST_Relation &source_lab)
81 {
82 EST_Item *s, *u;
83 float dur1, dur_u, p_time=0.0;
84
85 // NOTE: because of the extendLeft/extendRight phone join hack for missing diphones,
86 // the unit linked list *may be* shorter that the segment list.
87 //(admittedly could cause confusion)
88
89 for( s=source_lab.head(), u=diphone_stream.head(); (u!=0)&&(s!=0); u=u->next(), s=s->next()){
90 EST_Track *pm = track(u->f("coefs"));
91
92 int end_frame = pm->num_frames() - 1;
93 int mid_frame = u->I("middle_frame");
94
95 dur1 = pm->t(mid_frame);
96 dur_u = pm->t(end_frame);
97
98 s->set("end", (p_time+dur1) );
99
100 p_time += dur_u;
101 u->set("end", p_time);
102
103 if( u->f_present("extendRight") ){//because diphone squeezed out (see above)
104 s = s->next();
105 s->set("end", p_time );
106 }
107 }
108
109 if(s)
110 s->set("end", (p_time));
111 }
112
113 // temporary hack necessary because decoder can only take a
114 // function pointer (would be better to relax this restriction in
115 // the EST_Viterbi_Decoder class, or in a replacement class, rather
116 // than using this hack)
117 static DiphoneUnitVoice *globalTempVoicePtr = 0;
118
DiphoneUnitVoice(const EST_StrList & basenames,const EST_String & uttDir,const EST_String & wavDir,const EST_String & pmDir,const EST_String & coefDir,unsigned int sr,const EST_String & uttExt,const EST_String & wavExt,const EST_String & pmExt,const EST_String & coefExt)119 DiphoneUnitVoice::DiphoneUnitVoice( const EST_StrList& basenames,
120 const EST_String& uttDir,
121 const EST_String& wavDir,
122 const EST_String& pmDir,
123 const EST_String& coefDir,
124 unsigned int sr,
125 const EST_String& uttExt,
126 const EST_String& wavExt,
127 const EST_String& pmExt,
128 const EST_String& coefExt )
129 : pruning_beam( -1 ),
130 ob_pruning_beam( -1 ),
131 tc_rescoring_beam( -1 ),
132 tc_rescoring_weight( 0.0 ),
133 tc_weight( 1.0 ),
134 jc_weight( 1.0 ),
135 jc_f0_weight( 1.0 ),
136 jc_power_weight( 1.0 ),
137 jc_spectral_weight( 1.0 ),
138 prosodic_modification( 0 ),
139 wav_srate( sr ),
140 jc( 0 ),
141 jc_delete( false ),
142 tc( 0 ),
143 tc_delete( false ),
144 tcdh( 0 )
145
146 {
147 // make the default voice module with the supplied parameters
148 addVoiceModule( basenames, uttDir, wavDir, pmDir, coefDir,
149 wav_srate,
150 uttExt, wavExt, pmExt, coefExt );
151
152 diphone_backoff_rules = 0;
153 }
154
initialise(bool ignore_bad_tag)155 void DiphoneUnitVoice::initialise( bool ignore_bad_tag )
156 {
157 if( jc == 0 )
158 EST_error( "Need to set join cost calculator for voice" );
159
160 if( tc == 0 )
161 EST_error( "Need to set target cost calculator for voice" );
162
163 EST_TList<DiphoneVoiceModule*>::Entries it;
164
165 for( it.begin(voiceModules); it; it++ )
166 (*it)->initialise( tc, ignore_bad_tag );
167 }
168
addVoiceModule(const EST_StrList & basenames,const EST_String & uttDir,const EST_String & wavDir,const EST_String & pmDir,const EST_String & coefDir,unsigned int srate,const EST_String & uttExt,const EST_String & wavExt,const EST_String & pmExt,const EST_String & coefExt)169 bool DiphoneUnitVoice::addVoiceModule( const EST_StrList& basenames,
170 const EST_String& uttDir,
171 const EST_String& wavDir,
172 const EST_String& pmDir,
173 const EST_String& coefDir,
174 unsigned int srate,
175 const EST_String& uttExt,
176 const EST_String& wavExt,
177 const EST_String& pmExt,
178 const EST_String& coefExt )
179
180 {
181 DiphoneVoiceModule *vm;
182
183 if( srate != wav_srate )
184 EST_error( "Voice samplerate: %d\nmodule samplerate: %d",
185 wav_srate, srate );
186
187 vm = new DiphoneVoiceModule( basenames, uttDir, wavDir, pmDir, coefDir,
188 srate,
189 uttExt, wavExt, pmExt, coefExt );
190 CHECK_PTR(vm);
191
192 registerVoiceModule( vm );
193
194 return true;
195 }
196
197
registerVoiceModule(DiphoneVoiceModule * vm)198 void DiphoneUnitVoice::registerVoiceModule( DiphoneVoiceModule *vm )
199 {
200 voiceModules.append( vm );
201 }
202
203
setJoinCost(EST_JoinCost * jcost,bool del)204 void DiphoneUnitVoice::setJoinCost( EST_JoinCost *jcost, bool del )
205 {
206 if( jc_delete == true )
207 if( jc != 0 )
208 delete jc;
209
210 jc = jcost;
211 jc_delete = del;
212 }
213
setTargetCost(EST_TargetCost * tcost,bool del)214 void DiphoneUnitVoice::setTargetCost( EST_TargetCost *tcost, bool del )
215 {
216 if( tc_delete == true )
217 if( tc != 0 )
218 delete tc;
219
220 tc = tcost;
221 tc_delete = del;
222 }
223
224
~DiphoneUnitVoice()225 DiphoneUnitVoice::~DiphoneUnitVoice()
226 {
227 EST_TList<DiphoneVoiceModule*>::Entries it;
228
229 for( it.begin(voiceModules); it; it++ )
230 delete( *it );
231
232 if(diphone_backoff_rules)
233 delete diphone_backoff_rules;
234
235 if( jc_delete == true )
236 if( jc != 0 )
237 delete jc;
238
239 if( tc_delete == true )
240 if( tc != 0 )
241 delete tc;
242
243 if(tcdh)
244 delete tcdh;
245
246 }
247
248
addToCatalogue(const EST_Utterance * utt)249 void DiphoneUnitVoice::addToCatalogue( const EST_Utterance *utt )
250 {
251 // needed?
252 }
253
254
getDiphone(const EST_VTCandidate * cand,EST_Track * coef,EST_Wave * sig,int * midframe,bool extendLeft,bool extendRight)255 void DiphoneUnitVoice::getDiphone( const EST_VTCandidate *cand,
256 EST_Track* coef, EST_Wave* sig, int *midframe,
257 bool extendLeft, bool extendRight )
258 {
259 // The need for this function in this class is a bit messy, it would be far
260 // nicer just to be able to ask the Candidate itself to hand over the relevant
261 // synthesis parameters. In future, it will work that way ;)
262
263 // put there by DiphoneVoiceModule::getCandidateList
264 const DiphoneCandidate *diphcand = diphonecandidate( cand->name );
265
266 const DiphoneVoiceModule* parentModule = diphcand->dvm;
267 EST_Item *firstPhoneInDiphone = cand->s;
268
269 // need to call right getDiphone to do the actual work
270 parentModule->getDiphone( firstPhoneInDiphone, coef, sig, midframe, extendLeft, extendRight );
271 }
272
273 // REQUIREMENT: the unit relation must have previously been used to initialise the
274 // Viterbi decoder from which the path was produced.
fillUnitRelation(EST_Relation * units,const EST_VTPath * path)275 void DiphoneUnitVoice::fillUnitRelation( EST_Relation *units, const EST_VTPath *path )
276 {
277 EST_Item *it=units->tail();
278
279 for ( ; path != 0 && it != 0; path=path->from, it=it->prev() ){
280 EST_Track *coefs = new EST_Track;
281 CHECK_PTR(coefs);
282 EST_Wave *sig = new EST_Wave;
283 CHECK_PTR(sig);
284 int midf;
285
286 getDiphone( path->c, coefs, sig, &midf,
287 it->f_present("extendLeft"), it->f_present("extendRight"));
288
289 EST_Item *firstPhoneInDiphone = path->c->s;
290 it->set_val( "sig", est_val( sig ) );
291 it->set_val( "coefs", est_val( coefs ) );
292 it->set( "middle_frame", midf );
293 it->set( "source_utt", firstPhoneInDiphone->relation()->utt()->f.S("fileid"));
294 it->set_val( "source_ph1", est_val( firstPhoneInDiphone ));
295 it->set( "source_end", firstPhoneInDiphone->F("end"));
296 it->set( "target_cost", path->c->score );
297
298 //have to recalculate join cost as it's not currently saved anywhere
299 if( path->from == 0 )
300 it->set( "join_cost", 0.0);
301 else{
302 // join cost between right edge of left diphone and vice versa
303 const DiphoneCandidate *l_diph = diphonecandidate(path->from->c->name);
304 const DiphoneCandidate *r_diph = diphonecandidate(path->c->name);
305
306 it->set( "join_cost", (*jc)( l_diph, r_diph ) );
307 }
308 }
309 }
310
// The use of globalTempVoicePtr in this function is really just a temporary hack,
// necessary because the decoder as it stands at present can only take a function pointer
// (it would be better to relax this restriction in the EST_Viterbi_Decoder class, or in a
// replacement class, rather than using this hack)
315 // static EST_VTPath* extendPath( EST_VTPath *p, EST_VTCandidate *c,
316 // EST_Features&)
317 // {
318 // EST_VTPath *np = new EST_VTPath;
319 // CHECK_PTR(np);
320
321 // if( globalTempVoicePtr ==0 )
322 // EST_error( "globalTempVoicePtr is not set, can't continue" );
323
324 // const EST_JoinCost &jcost = globalTempVoicePtr->getJoinCostCalculator();
325
326 // np->c = c;
327 // np->from = p;
328 // np->state = c->pos;
329
330 // if ((p == 0) || (p->c == 0))
331 // np->score = c->score;
332 // else{
333 // // join cost between right edge of left diphone and vice versa
334 // np->score = p->score + c->score + jcost( p->c->s->next(), c->s );
335 // }
336 // return np;
337 // }
extendPath(EST_VTPath * p,EST_VTCandidate * c,EST_Features &)338 static EST_VTPath* extendPath( EST_VTPath *p, EST_VTCandidate *c,
339 EST_Features&)
340 {
341 EST_VTPath *np = new EST_VTPath;
342 CHECK_PTR(np);
343
344 if( globalTempVoicePtr ==0 )
345 EST_error( "globalTempVoicePtr is not set, can't continue" );
346
347 const EST_JoinCost &jcost = globalTempVoicePtr->getJoinCostCalculator();
348
349 np->c = c;
350 np->from = p;
351 np->state = c->pos;
352
353 if ((p == 0) || (p->c == 0))
354 np->score = c->score;
355 else{
356 const DiphoneCandidate *l_diph = diphonecandidate(p->c->name);
357 const DiphoneCandidate *r_diph = diphonecandidate(c->name);
358
359 // join cost between right edge of left diphone and vice versa
360 np->score = p->score + c->score + jcost( l_diph, r_diph );
361 }
362 return np;
363 }
364
365 // This function is a really just a temporary hack necessary because the decoder
366 // as it stands at present can only take a function pointer (would be better to relax
367 // this restriction in the EST_Viterbi_Decoder class, or in a replacement class, rather
368 // than using this hack)
getCandidatesFunction(EST_Item * s,EST_Features & f)369 static EST_VTCandidate* getCandidatesFunction( EST_Item *s,
370 EST_Features &f)
371 {
372 DiphoneUnitVoice *duv = globalTempVoicePtr;
373 if( duv==0 )
374 EST_error( "Candidate source voice is unset" );
375
376 return duv->getCandidates( s, f );
377 }
378
379 // Function which, given an item from the timeline relation that
380 // was originally used to initialise the EST_Viterbi_Decoder
381 // returns a pointer to a linked list of EST_VTCandidates
382 // (this is provided to the viterbi decoder upon its construction
383 // and (in)directly called by it as part of the decoding process...)
getCandidates(EST_Item * s,EST_Features & f) const384 EST_VTCandidate* DiphoneUnitVoice::getCandidates( EST_Item *s,
385 EST_Features &f) const
386 {
387 EST_VTCandidate *c = 0;
388 EST_VTCandidate *moduleListHead = 0;
389 EST_VTCandidate *moduleListTail = 0;
390
391 // these objects [c/sh]ould be a parameter visible in the user's script
392 // land, and will be in future...
393
394 // tc now a member
395 // EST_DefaultTargetCost default_target_cost;
396 // EST_TargetCost *tc = &default_target_cost;
397 // or
398 // EST_SchemeTargetCost scheme_target_cost(rintern( "targetcost"));
399 // EST_TargetCost *tc = &scheme_target_cost;
400
401 EST_TList<DiphoneVoiceModule*>::Entries module_iter;
402 int nfound, total=0;
403
404 ////////////////////////////////////////////////////////////////
405 // join linked list of candidates from each module into one list
406 for( module_iter.begin(voiceModules); module_iter; module_iter++ ){
407 nfound = (*module_iter)->getCandidateList( *s,
408 tc,
409 tcdh,
410 tc_weight,
411 &moduleListHead,
412 &moduleListTail );
413 if( nfound>0 ){
414 moduleListTail->next = c;
415 c = moduleListHead;
416 total += nfound;
417 }
418 }
419
420 if( total==0 )
421 EST_error( "Couldn't find diphone %s", (const char*)s->S("name") );
422
423 if( verbosity() > 0 )
424 printf( "Number of candidates found for target \"%s\": %d\n",
425 (const char*)s->S("name"), total );
426
427 if( ! ((tc_rescoring_beam == -1.0) || (tc_rescoring_weight <= 0.0)) )
428 rescoreCandidates( c, tc_rescoring_beam, tc_rescoring_weight );
429
430 return c;
431 }
432
diphoneCoverage(const EST_String filename) const433 void DiphoneUnitVoice::diphoneCoverage(const EST_String filename) const
434 {
435
436 EST_DiphoneCoverage dc;
437 EST_TList<DiphoneVoiceModule*>::Entries module_iter;
438
439 // for each module
440 for( module_iter.begin(voiceModules); module_iter; module_iter++ )
441 (*module_iter)->getDiphoneCoverageStats(&dc);
442
443 dc.print_stats(filename);
444
445 }
446
447
448
synthesiseWave(EST_Utterance * utt)449 bool DiphoneUnitVoice::synthesiseWave( EST_Utterance *utt )
450 {
451 getUnitSequence( utt );
452
453 return true;
454 }
455
456
457
getUnitSequence(EST_Utterance * utt)458 void DiphoneUnitVoice::getUnitSequence( EST_Utterance *utt )
459 {
460 EST_Relation *segs = utt->relation( "Segment" );
461 EST_Relation *units = utt->create_relation( "Unit" );
462
463 if(!tcdh)
464 tcdh = new TCDataHash(20);
465 else
466 tcdh->clear();
467
468 // Initialise the Unit relation time index for decoder
469 EST_String diphone_name;
470 EST_StrList missing_diphones;
471
472 EST_Item *it=segs->head();
473 if( it == 0 )
474 EST_error( "Segment relation is empty" );
475
476 bool extendLeftFlag = false;
477 for( ; it->next(); it=it->next() )
478 {
479 EST_String l = it->S("name");
480 EST_String r = it->next()->S("name");
481
482 EST_String diphone_name = EST_String::cat(l,"_",r);
483 EST_String orig = diphone_name;
484
485 if(tc->is_flatpack())
486 tcdh->add_item( it , ((EST_FlatTargetCost *)tc)->flatpack(it) );
487
488
489 // First attempt back off:
490 // If missing diphone is an interword diphone, insert a silence!
491 // Perceptual results say this is prefered.
492
493 if ( diphone_name != EST_String::Empty &&
494 !this->unitAvailable(diphone_name) )
495 {
496 EST_Item *s1,*s2;
497 EST_Item *w1=0,*w2=0;
498
499 cerr << "Missing diphone: "<< diphone_name << endl;
500
501 if((s1 = parent(it,"SylStructure")))
502 w1= parent(s1,"SylStructure");
503 if( (s2 = parent(it->next(),"SylStructure")))
504 w2= parent(s2,"SylStructure");
505
506 if( w1 && w2 && (w1 != w2) )
507 {
508 EST_Item *sil;
509
510 cerr << " Interword so inseting silence.\n";
511
512 sil = it->insert_after();
513 sil->set("name",ph_silence());
514
515 r = it->next()->S("name");
516 diphone_name = EST_String::cat(l,"_",r);
517
518 }
519 }
520
521
522 // Simple back off.
523 // Change diphone name for one we actually have.
524
525 while(diphone_name != EST_String::Empty &&
526 !this->unitAvailable(diphone_name) &&
527 diphone_backoff_rules)
528 {
529
530 cerr << " diphone still missing, backing off: " << diphone_name << endl;
531
532 diphone_name = diphone_backoff_rules->backoff(l,r);
533 l = diphone_name.before("_");
534 r = diphone_name.after("_");
535
536 cerr << " backed off: " << orig << " -> " << diphone_name << endl;
537
538 if( verbosity() > 0 ){
539 EST_warning("Backing off requested diphone %s to %s",
540 orig.str(),
541 diphone_name.str() );
542 }
543 }
544
545
546 //// Complex backoff. Changes the segment stream to the right,
547 //// may still leave a discontinuity to the left. This could be
548 //// fixed, but it would requires a better search. Rob's thoughts
549 //// are that the simple method works better, unless it resorts to
550 //// a bad default rule.
551
552
553 // while(!this->unitAvailable(diphone_name) &&
554 // diphone_backoff_rules &&
555 // !diphone_backoff_rules->backoff(it))
556 // diphone_name = EST_String::cat(it->S("name"),"_",it->next()->S("name"));
557
558 if( !this->unitAvailable( diphone_name ) ){
559 missing_diphones.append( diphone_name );
560 if(units->tail())
561 units->tail()->set( "extendRight", 1 );
562 extendLeftFlag = true; // trigger for next unit to make up second half of missing diphone
563 }
564 else{
565 EST_Item *t = units->append();
566 t->set( "name", diphone_name );
567 if(orig != diphone_name)
568 t->set( "missing_diphone",orig);
569 t->set_val( "ph1", est_val(it) );
570 if( extendLeftFlag == true ){
571 t->set( "extendLeft", 1 );
572 extendLeftFlag = false;
573 }
574 }
575 }
576
577 // stop if necessary units are still missing.
578 if( missing_diphones.length() > 0 ){
579 for( EST_Litem *it=missing_diphones.head(); it!=0 ; it=it->next() )
580 printf( "requested diphone missing: %s\n", missing_diphones(it).str() );
581
582 EST_warning("Making phone joins to compensate...");
583 // EST_error("Unable to synthesise utterance due to missing diphones");
584 }
585
586 // Make the decoder do its thing
587 // -1 means number of states at each time point not fixed
588 EST_Viterbi_Decoder v( getCandidatesFunction, extendPath, -1 );
589
590 // turn on pruning if necessary
591 if( (pruning_beam>0) || (ob_pruning_beam>0) )
592 v.set_pruning_parameters( pruning_beam, ob_pruning_beam );
593
594 // temporary hack necessary because decoder can only take a
595 // function pointer (would be better to relax this restriction in
596 // the EST_Viterbi_Decoder class, or in a replacement class, rather
597 // than using this hack)
598 globalTempVoicePtr = this;
599
600 v.set_big_is_good(false);
601
602 if( verbosity() > 0 )
603 v.turn_on_trace();
604
605 v.initialise( units );
606 v.search();
607
608 // take hold of the best path (end thereof)
609 EST_VTPath *bestp=0;
610 if( !v.result( &bestp ) )
611 EST_error( "No best candidate sequence found" );
612
613 // fill in the best path features in the Unit Relation
614 fillUnitRelation( units, bestp );
615
616 my_parse_diphone_times( *units, *segs );
617 }
618
619
620 /////////////////////////////////////////////////////////////////////////////////////
621 // Canned example experimental code (proof of concept rather than intelligently done)
622
itemListContainsItem(const ItemList * il,const EST_Item * item)623 static inline bool itemListContainsItem( const ItemList* il, const EST_Item *item )
624 {
625 ItemList::Entries it;
626
627 for( it.begin( *il ); it; it++ )
628 if( (*it) == item )
629 return true;
630
631 return false;
632 }
633
634
getCandidatesWithOmissionsFunction(EST_Item * s,EST_Features & f)635 static EST_VTCandidate* getCandidatesWithOmissionsFunction( EST_Item *s, EST_Features &f )
636 {
637 DiphoneUnitVoice *duv = globalTempVoicePtr;
638 if( duv==0 )
639 EST_error( "Candidate source voice is unset" );
640
641 //get candidate list as usual
642 EST_VTCandidate *candlist = duv->getCandidates( s, f );
643
644 //filter out candidates on basis of omission list (yes, this is quite dumb)
645 if( s->f_present( "omitlist" ) ){
646
647 EST_warning( "omitlist found in unit %s", s->S("name").str() );
648
649 ItemList *omitlist = itemlist( s->f("omitlist") );
650
651 //until one candidate remains as head (to keep hold of list head)
652 while( candlist != 0 && itemListContainsItem( omitlist, candlist->s ) ){
653 EST_VTCandidate *del_cand = candlist;
654 candlist = candlist->next;
655 del_cand->next = 0; //so deletion doesn't trigger total list deletion
656 delete del_cand;
657 }
658
659 //then continue down list
660 EST_VTCandidate *prev = candlist;
661 EST_VTCandidate *cand = candlist->next;
662 while( cand!=0 ){
663 if( itemListContainsItem( omitlist, cand->s ) ){ //delete cand on true
664 prev->next = cand->next;
665 cand->next = 0; //so deletion doesn't trigger total list deletion
666 delete cand;
667 cand = prev;
668 }
669 cand = cand->next;
670 }
671
672 if( candlist == 0 )
673 EST_error( "zero candidates remain after filtering" );
674
675 }
676
677 return candlist;
678 }
679
680 // For when the utterance already has the unit sequence, with certain candidates
681 // flagged as to be avoided, or mandatory and so on...
regetUnitSequence(EST_Utterance * utt)682 void DiphoneUnitVoice::regetUnitSequence( EST_Utterance *utt )
683 {
684 // Unit relation should already be in existence for decoder
685 EST_Relation *units = utt->relation( "Unit" );
686 EST_Item *it=units->head();
687 if( it == 0 )
688 EST_error( "Unit relation is empty" );
689
690 // Make the decoder do its thing (again)
691 // -1 means number of states at each time point not fixed
692 EST_Viterbi_Decoder v( getCandidatesWithOmissionsFunction, extendPath, -1 );
693
694 // turn on pruning if necessary
695 if( (pruning_beam>0) || (ob_pruning_beam>0) )
696 v.set_pruning_parameters( pruning_beam, ob_pruning_beam );
697
698 // temporary hack necessary because decoder can only take a
699 // function pointer (would be better to relax this restriction in
700 // the EST_Viterbi_Decoder class, or in a replacement class, rather
701 // than using this hack)
702 globalTempVoicePtr = this;
703
704 v.set_big_is_good(false);
705
706 if( verbosity() > 0 )
707 v.turn_on_trace();
708
709 v.initialise( units );
710 v.search();
711
712 // take hold of the best path (end thereof)
713 EST_VTPath *bestp=0;
714 if( !v.result( &bestp ) )
715 EST_error( "No best candidate sequence found" );
716
717 // fill in the best path features in the Unit Relation
718 fillUnitRelation( units, bestp );
719
720 EST_Relation *segs = utt->relation("Segment");
721 my_parse_diphone_times( *units, *segs );
722 }
723
724 // End canned example experimental code ///////////////////////////////////////////
725 ///////////////////////////////////////////////////////////////////////////////////
726
727
unitAvailable(const EST_String & diphone) const728 bool DiphoneUnitVoice::unitAvailable( const EST_String &diphone ) const
729 {
730 EST_TList<DiphoneVoiceModule*>::Entries it;
731
732 for( it.begin(voiceModules); it; it++ )
733 if( (*it)->numAvailableCandidates(diphone) > 0 )
734 return true;
735
736 return false;
737 }
738
numAvailableCandidates(const EST_String & diphone) const739 unsigned int DiphoneUnitVoice::numAvailableCandidates( const EST_String &diphone ) const
740 {
741 unsigned int number = 0;
742 EST_TList<DiphoneVoiceModule*>::Entries it;
743
744 for( it.begin(voiceModules); it; it++ )
745 number += (*it)->numAvailableCandidates(diphone);
746
747 return number;
748 }
749
750
751 ////////////////////////////////////////////////////////////////////////
752 ////////////////////////////////////////////////////////////////////////
753 // special case of the above for utterances structures that are
754 // actually in the voice database, which doesn't do any search
755 // This is useful for doing copy synthesis of utterances (eg.
756 // to test out resynthesis, prosodic modification and so on)
getCopyUnitUtterance(const EST_String & utt_fname,EST_Utterance ** utt_out) const757 void DiphoneUnitVoice::getCopyUnitUtterance( const EST_String &utt_fname,
758 EST_Utterance **utt_out ) const
759 {
760 // need to find which, if any, voice module has this utterance
761 // in its list
762 EST_TList<DiphoneVoiceModule*>::Entries module_iter;
763 EST_Utterance *db_utt=0;
764 for( module_iter.begin(voiceModules); module_iter; module_iter++ )
765 if( (*module_iter)->getUtterance(&db_utt, "fileid", utt_fname) == true )
766 break;
767
768 if( db_utt == 0 )
769 EST_error( "Could not find Utterance %s in any voice module",
770 utt_fname.str() );
771 else{
772 // deep copy database utterance and fill in Unit relation
773 *utt_out = new EST_Utterance( *db_utt );
774 CHECK_PTR(utt_out);
775
776 EST_Utterance myUtt( *db_utt );
777
778 cerr << myUtt.relation_present( "Segment" ) << " "
779 << myUtt.num_relations() <<endl;
780
781
782 cerr << db_utt->relation_present( "Segment" ) << " "
783 << (*utt_out)->relation_present( "Segment" ) << " "
784 << (*utt_out)->num_relations() <<endl;
785
786
787 EST_Relation *segs = (*utt_out)->relation( "Segment" );
788 EST_Relation *units = (*utt_out)->create_relation( "Unit" );
789
790 // Initialise the Unit relation + fill in necessary/suitable
791 // synthesis parameters
792 EST_String ph1, ph2;
793 EST_Item *it = segs->tail();
794 EST_Item *db_utt_seg_it = db_utt->relation( "Segment" )->tail();
795 if( it == 0 )
796 EST_error( "Segment relation is empty" );
797 else{
798 ph2 = it->S("name");
799 while( ((it=it->prev())!=0) &&
800 ((db_utt_seg_it=db_utt_seg_it->prev())!=0) ){
801 EST_Track *coefs = new EST_Track;
802 CHECK_PTR(coefs);
803 EST_Wave *sig = new EST_Wave;
804 CHECK_PTR(sig);
805 int midf;
806
807 (*module_iter)->getDiphone( db_utt_seg_it, coefs, sig, &midf );
808
809 ph1 = it->S("name");
810 EST_Item *t = units->prepend();
811 t->set( "name", EST_String::cat(ph1,"_",ph2) );
812 t->set_val( "ph1", est_val(it) );
813 t->set_val( "sig", est_val( sig ) );
814 t->set_val( "coefs", est_val( coefs ) );
815 t->set( "middle_frame", midf );
816 t->set( "source_utt", db_utt->f.S("fileid"));
817 t->set_val( "source_ph1", est_val( db_utt_seg_it ));
818 t->set( "source_end", db_utt_seg_it->F("end"));
819 t->set( "target_cost", 0.0 );
820 t->set( "join_cost", 0.0);
821
822 ph2 = ph1;
823 }
824 }
825 my_parse_diphone_times( *units, *segs );
826
827 // this is for copy synthesis, so copy actual timings
828 //for( EST_Item *seg = segs->head(); it!=0; it=it->next() )
829 //seg->set( "end", seg->F("source_end") );
830 }
831 }
832
833 ////////////////////////////////////////////////////////////////////////
834 ////////////////////////////////////////////////////////////////////////
835
836
837
numUnitTypes() const838 unsigned int DiphoneUnitVoice::numUnitTypes() const
839 {
840 //necessary?
841 return 0;
842 }
843
numDatabaseUnits() const844 unsigned int DiphoneUnitVoice::numDatabaseUnits() const
845 {
846 unsigned int sum=0;
847
848 EST_TList<DiphoneVoiceModule*>::Entries it;
849
850 for( it.begin( voiceModules ); it; it++ )
851 sum += (*it)->numModuleUnits();
852
853 return sum;
854 }
855
856
857 //////////////////////////////////////////////////////////////////////////
858
set_diphone_backoff(DiphoneBackoff * dbo)859 void DiphoneUnitVoice::set_diphone_backoff(DiphoneBackoff *dbo)
860 {
861 if (diphone_backoff_rules)
862 delete diphone_backoff_rules;
863 diphone_backoff_rules = dbo;
864 }
865
866
getPhoneList(const EST_String & phone,ItemList & list)867 int DiphoneUnitVoice::getPhoneList( const EST_String &phone, ItemList &list )
868 {
869 unsigned int n=0;
870
871 EST_TList<DiphoneVoiceModule*>::Entries it;
872 for( it.begin( voiceModules ); it; it++ )
873 n += (*it)->getPhoneList( phone, list );
874
875 return n;
876 }
877
878
879
precomputeJoinCosts(const EST_StrList & phones,bool verbose)880 void DiphoneUnitVoice::precomputeJoinCosts( const EST_StrList &phones, bool verbose )
881 {
882 EST_StrList::Entries it;
883 for( it.begin( phones ); it; it++ ){
884 ItemList *l = new ItemList;
885 CHECK_PTR(l);
886
887 unsigned int n = getPhoneList( (*it), *l );
888
889 if( verbose==true )
890 cerr << "phone " << (*it) << " " << n << " instances\n";
891
892 if( n>0 ){
893 jc->computeAndCache( *l, true ); //verbose=true
894 }
895 else
896 EST_warning( "Phone %s not listed in voice", (*it).str() );
897
898 delete l;
899 }
900 }
901