1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                 (University of Edinburgh, UK) and                     */
5 /*                           Korin Richmond                              */
6 /*                         Copyright (c) 2002                            */
7 /*                         All Rights Reserved.                          */
8 /*                                                                       */
9 /*  Permission is hereby granted, free of charge, to use and distribute  */
10 /*  this software and its documentation without restriction, including   */
11 /*  without limitation the rights to use, copy, modify, merge, publish,  */
12 /*  distribute, sublicense, and/or sell copies of this work, and to      */
13 /*  permit persons to whom this work is furnished to do so, subject to   */
14 /*  the following conditions:                                            */
15 /*                                                                       */
16 /*   1. The code must retain the above copyright notice, this list of    */
17 /*      conditions and the following disclaimer.                         */
18 /*   2. Any modifications must be clearly marked as such.                */
19 /*   3. Original authors' names are not deleted.                         */
20 /*   4. The authors' names are not used to endorse or promote products   */
21 /*      derived from this software without specific prior written        */
22 /*      permission.                                                      */
23 /*                                                                       */
24 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
25 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
26 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT   */
27 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
28 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
29 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
30 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
31 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
32 /*  THIS SOFTWARE.                                                       */
33 /*                                                                       */
34 /*************************************************************************/
35 /*                                                                       */
36 /*                          Author: Korin Richmond                       */
37 /*                            Date:  Aug  2002                           */
38 /* --------------------------------------------------------------------- */
39 /* first stab at a diphone unit selection "voice" - using a list of      */
40 /* utterance objects                                                     */
41 /*************************************************************************/
42 
43 #include "festival.h"
44 #include "DiphoneUnitVoice.h"
45 #include "DiphoneVoiceModule.h"
46 #include "EST_DiphoneCoverage.h"
47 #include "EST_rw_status.h"
48 #include "EST_viterbi.h"
49 #include "EST_Track.h"
50 #include "EST_track_aux.h"
51 #include "EST_Wave.h"
52 #include "EST_THash.h"
53 #include "EST_TList.h"
54 #include "EST_types.h"
55 #include "ling_class/EST_Utterance.h"
56 #include "siod.h"
57 #include "siod_est.h"
58 #include "safety.h"
59 #include <cstdlib>
60 
61 #include "EST_TargetCost.h"
62 #include "TargetCostRescoring.h"
63 #include "EST_JoinCost.h"
64 #include "EST_JoinCostCache.h"
65 
66 #include "EST_Val.h"
67 
// Register ItemList under the name "itemlist" with the SIOD and EST_Val type
// machinery.  NOTE(review): presumably this is what provides the itemlist()
// accessor used later in this file -- confirm against the macro definitions.
SIOD_REGISTER_TYPE(itemlist,ItemList)
VAL_REGISTER_TYPE(itemlist,ItemList)
70 
71 // from src/modules/UniSyn_diphone/us_diphone.h
72 // this won't be staying here long...
// NOTE(review): nothing in the visible part of this file calls this function;
// the local my_parse_diphone_times() below is used instead.
void parse_diphone_times(EST_Relation &diphone_stream,
			 EST_Relation &source_lab);
75 
// Register DiphoneUnitVoice under the name "du_voice" with the SIOD and
// EST_Val class machinery.  NOTE(review): the helpers these macros generate
// are not visible here -- confirm against the macro definitions.
SIOD_REGISTER_CLASS(du_voice,DiphoneUnitVoice)
VAL_REGISTER_CLASS(du_voice,DiphoneUnitVoice)
78 
79 static void my_parse_diphone_times(EST_Relation &diphone_stream,
80 				   EST_Relation &source_lab)
81 {
82   EST_Item *s, *u;
83   float dur1, dur_u, p_time=0.0;
84 
85   // NOTE: because of the extendLeft/extendRight phone join hack for missing diphones,
86   // the unit linked list *may be* shorter that the segment list.
87   //(admittedly could cause confusion)
88 
89   for( s=source_lab.head(), u=diphone_stream.head(); (u!=0)&&(s!=0); u=u->next(), s=s->next()){
90     EST_Track *pm = track(u->f("coefs"));
91 
92     int end_frame = pm->num_frames() - 1;
93     int mid_frame = u->I("middle_frame");
94 
95     dur1 = pm->t(mid_frame);
96     dur_u = pm->t(end_frame);
97 
98     s->set("end", (p_time+dur1) );
99 
100     p_time += dur_u;
101     u->set("end", p_time);
102 
103     if( u->f_present("extendRight") ){//because diphone squeezed out (see above)
104       s = s->next();
105       s->set("end", p_time );
106     }
107   }
108 
109   if(s)
110     s->set("end", (p_time));
111 }
112 
113 // temporary hack necessary because decoder can only take a
114 // function pointer (would be better to relax this restriction in
115 // the EST_Viterbi_Decoder class, or in a replacement class, rather
116 // than using this hack)
// Set to the current voice by getUnitSequence()/regetUnitSequence() before
// the decoder is run, and read by the extendPath() and getCandidates*Function()
// callbacks.
static DiphoneUnitVoice *globalTempVoicePtr = 0;
118 
DiphoneUnitVoice(const EST_StrList & basenames,const EST_String & uttDir,const EST_String & wavDir,const EST_String & pmDir,const EST_String & coefDir,unsigned int sr,const EST_String & uttExt,const EST_String & wavExt,const EST_String & pmExt,const EST_String & coefExt)119 DiphoneUnitVoice::DiphoneUnitVoice( const EST_StrList& basenames,
120 				    const EST_String& uttDir,
121 				    const EST_String& wavDir,
122 				    const EST_String& pmDir,
123 				    const EST_String& coefDir,
124 				    unsigned int sr,
125 				    const EST_String& uttExt,
126 				    const EST_String& wavExt,
127 				    const EST_String& pmExt,
128 				    const EST_String& coefExt )
129   : pruning_beam( -1 ),
130     ob_pruning_beam( -1 ),
131     tc_rescoring_beam( -1 ),
132     tc_rescoring_weight( 0.0 ),
133     tc_weight( 1.0 ),
134     jc_weight( 1.0 ),
135     jc_f0_weight( 1.0 ),
136     jc_power_weight( 1.0 ),
137     jc_spectral_weight( 1.0 ),
138     prosodic_modification( 0 ),
139     wav_srate( sr ),
140     jc( 0 ),
141     jc_delete( false ),
142     tc( 0 ),
143     tc_delete( false ),
144     tcdh( 0 )
145 
146 {
147   // make the default voice module with the supplied parameters
148   addVoiceModule( basenames, uttDir, wavDir, pmDir, coefDir,
149 		  wav_srate,
150 		  uttExt, wavExt, pmExt, coefExt );
151 
152   diphone_backoff_rules = 0;
153 }
154 
initialise(bool ignore_bad_tag)155 void DiphoneUnitVoice::initialise( bool ignore_bad_tag )
156 {
157   if( jc == 0 )
158     EST_error( "Need to set join cost calculator for voice" );
159 
160   if( tc == 0 )
161     EST_error( "Need to set target cost calculator for voice" );
162 
163   EST_TList<DiphoneVoiceModule*>::Entries it;
164 
165   for( it.begin(voiceModules); it; it++ )
166     (*it)->initialise( tc, ignore_bad_tag );
167 }
168 
addVoiceModule(const EST_StrList & basenames,const EST_String & uttDir,const EST_String & wavDir,const EST_String & pmDir,const EST_String & coefDir,unsigned int srate,const EST_String & uttExt,const EST_String & wavExt,const EST_String & pmExt,const EST_String & coefExt)169 bool DiphoneUnitVoice::addVoiceModule( const EST_StrList& basenames,
170 				       const EST_String& uttDir,
171 				       const EST_String& wavDir,
172 				       const EST_String& pmDir,
173 				       const EST_String& coefDir,
174 				       unsigned int srate,
175 				       const EST_String& uttExt,
176 				       const EST_String& wavExt,
177 				       const EST_String& pmExt,
178 				       const EST_String& coefExt )
179 
180 {
181   DiphoneVoiceModule *vm;
182 
183   if( srate != wav_srate )
184     EST_error( "Voice samplerate: %d\nmodule samplerate: %d",
185 	       wav_srate, srate );
186 
187   vm = new DiphoneVoiceModule( basenames, uttDir, wavDir, pmDir, coefDir,
188 			       srate,
189 			       uttExt, wavExt, pmExt, coefExt );
190   CHECK_PTR(vm);
191 
192   registerVoiceModule( vm );
193 
194   return true;
195 }
196 
197 
registerVoiceModule(DiphoneVoiceModule * vm)198 void DiphoneUnitVoice::registerVoiceModule( DiphoneVoiceModule *vm )
199 {
200   voiceModules.append( vm );
201 }
202 
203 
setJoinCost(EST_JoinCost * jcost,bool del)204 void DiphoneUnitVoice::setJoinCost( EST_JoinCost *jcost, bool del )
205 {
206   if( jc_delete == true )
207     if( jc != 0 )
208       delete jc;
209 
210   jc = jcost;
211   jc_delete = del;
212 }
213 
setTargetCost(EST_TargetCost * tcost,bool del)214 void DiphoneUnitVoice::setTargetCost( EST_TargetCost *tcost, bool del )
215 {
216   if( tc_delete == true )
217     if( tc != 0 )
218       delete tc;
219 
220   tc = tcost;
221   tc_delete = del;
222 }
223 
224 
~DiphoneUnitVoice()225 DiphoneUnitVoice::~DiphoneUnitVoice()
226 {
227   EST_TList<DiphoneVoiceModule*>::Entries it;
228 
229   for( it.begin(voiceModules); it; it++ )
230     delete( *it );
231 
232   if(diphone_backoff_rules)
233     delete diphone_backoff_rules;
234 
235   if( jc_delete == true )
236     if( jc != 0 )
237       delete jc;
238 
239   if( tc_delete == true )
240     if( tc != 0 )
241       delete tc;
242 
243   if(tcdh)
244     delete tcdh;
245 
246 }
247 
248 
addToCatalogue(const EST_Utterance * utt)249 void DiphoneUnitVoice::addToCatalogue( const EST_Utterance *utt )
250 {
251   // needed?
252 }
253 
254 
getDiphone(const EST_VTCandidate * cand,EST_Track * coef,EST_Wave * sig,int * midframe,bool extendLeft,bool extendRight)255 void DiphoneUnitVoice::getDiphone( const EST_VTCandidate *cand,
256 				   EST_Track* coef, EST_Wave* sig, int *midframe,
257 				   bool extendLeft, bool extendRight )
258 {
259   // The need for this function in this class is a bit messy, it would be far
260   // nicer just to be able to ask the Candidate itself to hand over the relevant
261   // synthesis parameters.  In future, it will work that way ;)
262 
263   // put there by DiphoneVoiceModule::getCandidateList
264   const DiphoneCandidate *diphcand = diphonecandidate( cand->name );
265 
266   const DiphoneVoiceModule* parentModule = diphcand->dvm;
267   EST_Item *firstPhoneInDiphone = cand->s;
268 
269   // need to call right getDiphone to do the actual work
270   parentModule->getDiphone( firstPhoneInDiphone, coef, sig, midframe, extendLeft, extendRight );
271 }
272 
273 // REQUIREMENT: the unit relation must have previously been used to initialise the
274 // Viterbi decoder from which the path was produced.
fillUnitRelation(EST_Relation * units,const EST_VTPath * path)275 void DiphoneUnitVoice::fillUnitRelation( EST_Relation *units, const EST_VTPath *path )
276 {
277   EST_Item *it=units->tail();
278 
279   for ( ; path != 0 && it != 0; path=path->from, it=it->prev() ){
280     EST_Track *coefs = new EST_Track;
281     CHECK_PTR(coefs);
282     EST_Wave *sig = new EST_Wave;
283     CHECK_PTR(sig);
284     int midf;
285 
286     getDiphone( path->c, coefs, sig, &midf,
287 		it->f_present("extendLeft"), it->f_present("extendRight"));
288 
289     EST_Item *firstPhoneInDiphone = path->c->s;
290     it->set_val( "sig", est_val( sig ) );
291     it->set_val( "coefs", est_val( coefs ) );
292     it->set( "middle_frame", midf );
293     it->set( "source_utt", firstPhoneInDiphone->relation()->utt()->f.S("fileid"));
294     it->set_val( "source_ph1", est_val( firstPhoneInDiphone ));
295     it->set( "source_end", firstPhoneInDiphone->F("end"));
296     it->set( "target_cost", path->c->score );
297 
298     //have to recalculate join cost as it's not currently saved anywhere
299     if( path->from == 0 )
300       it->set( "join_cost", 0.0);
301     else{
302       // join cost between right edge of left diphone and vice versa
303       const DiphoneCandidate *l_diph = diphonecandidate(path->from->c->name);
304       const DiphoneCandidate *r_diph = diphonecandidate(path->c->name);
305 
306       it->set( "join_cost", (*jc)( l_diph, r_diph ) );
307     }
308   }
309 }
310 
// The use of globalTempVoicePtr in this function is really just a temporary hack,
// necessary because the decoder as it stands at present can only take a function pointer
// (it would be better to relax this restriction in the EST_Viterbi_Decoder class, or in a
// replacement class, rather than using this hack).
315 // static EST_VTPath* extendPath( EST_VTPath *p, EST_VTCandidate *c,
316 // 	 		       EST_Features&)
317 // {
318 //   EST_VTPath *np = new EST_VTPath;
319 //   CHECK_PTR(np);
320 
321 //   if( globalTempVoicePtr ==0 )
322 //     EST_error( "globalTempVoicePtr is not set, can't continue" );
323 
324 //   const EST_JoinCost &jcost = globalTempVoicePtr->getJoinCostCalculator();
325 
326 //   np->c = c;
327 //   np->from = p;
328 //   np->state = c->pos;
329 
330 //   if ((p == 0) || (p->c == 0))
331 //     np->score = c->score;
332 //   else{
333 //     // join cost between right edge of left diphone and vice versa
334 //     np->score = p->score + c->score + jcost( p->c->s->next(), c->s );
335 //   }
336 //   return np;
337 // }
extendPath(EST_VTPath * p,EST_VTCandidate * c,EST_Features &)338 static EST_VTPath* extendPath( EST_VTPath *p, EST_VTCandidate *c,
339 	 		       EST_Features&)
340 {
341   EST_VTPath *np = new EST_VTPath;
342   CHECK_PTR(np);
343 
344   if( globalTempVoicePtr ==0 )
345     EST_error( "globalTempVoicePtr is not set, can't continue" );
346 
347   const EST_JoinCost &jcost = globalTempVoicePtr->getJoinCostCalculator();
348 
349   np->c = c;
350   np->from = p;
351   np->state = c->pos;
352 
353   if ((p == 0) || (p->c == 0))
354     np->score = c->score;
355   else{
356     const DiphoneCandidate *l_diph = diphonecandidate(p->c->name);
357     const DiphoneCandidate *r_diph = diphonecandidate(c->name);
358 
359     // join cost between right edge of left diphone and vice versa
360     np->score = p->score + c->score + jcost( l_diph, r_diph );
361   }
362   return np;
363 }
364 
// This function is really just a temporary hack, necessary because the decoder
// as it stands at present can only take a function pointer (it would be better to relax
// this restriction in the EST_Viterbi_Decoder class, or in a replacement class, rather
// than using this hack).
getCandidatesFunction(EST_Item * s,EST_Features & f)369 static EST_VTCandidate* getCandidatesFunction( EST_Item *s,
370 					       EST_Features &f)
371 {
372   DiphoneUnitVoice *duv = globalTempVoicePtr;
373   if( duv==0 )
374     EST_error( "Candidate source voice is unset" );
375 
376   return duv->getCandidates( s, f );
377 }
378 
379 // Function which, given an item from the timeline relation that
380 // was originally used to initialise the EST_Viterbi_Decoder
381 // returns a pointer to a linked list of EST_VTCandidates
382 // (this is provided to the viterbi decoder upon its construction
383 // and (in)directly called by it as part of the decoding process...)
getCandidates(EST_Item * s,EST_Features & f) const384 EST_VTCandidate* DiphoneUnitVoice::getCandidates( EST_Item *s,
385 						  EST_Features &f) const
386 {
387   EST_VTCandidate *c = 0;
388   EST_VTCandidate *moduleListHead = 0;
389   EST_VTCandidate *moduleListTail = 0;
390 
391   // these objects [c/sh]ould be a parameter visible in the user's script
392   // land, and will be in future...
393 
394   // tc now a member
395   // EST_DefaultTargetCost default_target_cost;
396   // EST_TargetCost *tc = &default_target_cost;
397   // or
398   //  EST_SchemeTargetCost scheme_target_cost(rintern( "targetcost"));
399   //  EST_TargetCost *tc = &scheme_target_cost;
400 
401   EST_TList<DiphoneVoiceModule*>::Entries module_iter;
402   int nfound, total=0;
403 
404   ////////////////////////////////////////////////////////////////
405   // join linked list of candidates from each module into one list
406   for( module_iter.begin(voiceModules); module_iter; module_iter++ ){
407     nfound = (*module_iter)->getCandidateList( *s,
408 					       tc,
409 					       tcdh,
410 					       tc_weight,
411 					       &moduleListHead,
412 					       &moduleListTail );
413     if( nfound>0 ){
414       moduleListTail->next = c;
415       c = moduleListHead;
416       total += nfound;
417     }
418   }
419 
420   if( total==0 )
421     EST_error( "Couldn't find diphone %s", (const char*)s->S("name") );
422 
423   if( verbosity() > 0 )
424     printf( "Number of candidates found for target \"%s\": %d\n",
425 	    (const char*)s->S("name"), total );
426 
427   if( ! ((tc_rescoring_beam == -1.0) || (tc_rescoring_weight <= 0.0)) )
428     rescoreCandidates( c, tc_rescoring_beam, tc_rescoring_weight );
429 
430   return c;
431 }
432 
diphoneCoverage(const EST_String filename) const433 void DiphoneUnitVoice::diphoneCoverage(const EST_String filename) const
434 {
435 
436   EST_DiphoneCoverage dc;
437   EST_TList<DiphoneVoiceModule*>::Entries module_iter;
438 
439   // for each module
440   for( module_iter.begin(voiceModules); module_iter; module_iter++ )
441     (*module_iter)->getDiphoneCoverageStats(&dc);
442 
443   dc.print_stats(filename);
444 
445 }
446 
447 
448 
synthesiseWave(EST_Utterance * utt)449 bool DiphoneUnitVoice::synthesiseWave( EST_Utterance *utt )
450 {
451   getUnitSequence( utt );
452 
453   return true;
454 }
455 
456 
457 
getUnitSequence(EST_Utterance * utt)458 void DiphoneUnitVoice::getUnitSequence( EST_Utterance  *utt )
459 {
460   EST_Relation *segs = utt->relation( "Segment" );
461   EST_Relation *units = utt->create_relation( "Unit" );
462 
463   if(!tcdh)
464     tcdh = new TCDataHash(20);
465   else
466     tcdh->clear();
467 
468   // Initialise the Unit relation time index for decoder
469   EST_String diphone_name;
470   EST_StrList missing_diphones;
471 
472   EST_Item *it=segs->head();
473   if( it == 0 )
474     EST_error( "Segment relation is empty" );
475 
476   bool extendLeftFlag = false;
477   for( ; it->next(); it=it->next() )
478     {
479       EST_String l = it->S("name");
480       EST_String r = it->next()->S("name");
481 
482       EST_String diphone_name = EST_String::cat(l,"_",r);
483       EST_String orig = diphone_name;
484 
485       if(tc->is_flatpack())
486 	tcdh->add_item( it , ((EST_FlatTargetCost *)tc)->flatpack(it) );
487 
488 
489       // First attempt back off:
490       // If missing diphone is an interword diphone, insert a silence!
491       // Perceptual results say this is prefered.
492 
493       if ( diphone_name != EST_String::Empty &&
494 	   !this->unitAvailable(diphone_name) )
495 	{
496 	  EST_Item *s1,*s2;
497 	  EST_Item *w1=0,*w2=0;
498 
499 	  cerr << "Missing diphone: "<< diphone_name << endl;
500 
501 	  if((s1 = parent(it,"SylStructure")))
502 	    w1= parent(s1,"SylStructure");
503 	  if( (s2 = parent(it->next(),"SylStructure")))
504 	    w2= parent(s2,"SylStructure");
505 
506 	  if( w1 && w2 && (w1 != w2) )
507 	    {
508 	      EST_Item *sil;
509 
510 	      cerr << " Interword so inseting silence.\n";
511 
512 	      sil = it->insert_after();
513 	      sil->set("name",ph_silence());
514 
515 	      r = it->next()->S("name");
516 	      diphone_name = EST_String::cat(l,"_",r);
517 
518 	    }
519 	}
520 
521 
522       // Simple back off.
523       // Change diphone name for one we actually have.
524 
525       while(diphone_name != EST_String::Empty &&
526 	    !this->unitAvailable(diphone_name) &&
527 	    diphone_backoff_rules)
528 	{
529 
530 	  cerr << " diphone still missing, backing off: " << diphone_name << endl;
531 
532 	  diphone_name = diphone_backoff_rules->backoff(l,r);
533 	  l = diphone_name.before("_");
534 	  r = diphone_name.after("_");
535 
536 	  cerr << " backed off: " << orig << " -> " << diphone_name << endl;
537 
538 	  if( verbosity() > 0 ){
539 	    EST_warning("Backing off requested diphone %s to %s",
540 			orig.str(),
541 			diphone_name.str() );
542 	  }
543 	}
544 
545 
546       //// Complex backoff.  Changes the segment stream to the right,
547       //// may still leave a discontinuity to the left. This could be
548       //// fixed, but it would requires a better search. Rob's thoughts
549       //// are that the simple method works better, unless it resorts to
550       //// a bad default rule.
551 
552 
553       //    while(!this->unitAvailable(diphone_name) &&
554       //          diphone_backoff_rules &&
555       //          !diphone_backoff_rules->backoff(it))
556       //      diphone_name = EST_String::cat(it->S("name"),"_",it->next()->S("name"));
557 
558       if( !this->unitAvailable( diphone_name ) ){
559 	missing_diphones.append( diphone_name );
560 	if(units->tail())
561 	  units->tail()->set( "extendRight", 1 );
562 	extendLeftFlag = true; // trigger for next unit to make up second half of missing diphone
563       }
564       else{
565 	EST_Item *t = units->append();
566 	t->set( "name", diphone_name );
567 	if(orig != diphone_name)
568 	  t->set( "missing_diphone",orig);
569 	t->set_val( "ph1", est_val(it) );
570 	if( extendLeftFlag == true ){
571 	  t->set( "extendLeft", 1 );
572 	  extendLeftFlag = false;
573 	}
574       }
575     }
576 
577   // stop if necessary units are still missing.
578   if( missing_diphones.length() > 0 ){
579     for( EST_Litem *it=missing_diphones.head(); it!=0 ; it=it->next() )
580       printf( "requested diphone missing: %s\n", missing_diphones(it).str() );
581 
582     EST_warning("Making phone joins to compensate...");
583     //    EST_error("Unable to synthesise utterance due to missing diphones");
584   }
585 
586   // Make the decoder do its thing
587   // -1 means number of states at each time point not fixed
588   EST_Viterbi_Decoder v( getCandidatesFunction, extendPath, -1 );
589 
590   // turn on pruning if necessary
591   if( (pruning_beam>0) || (ob_pruning_beam>0) )
592     v.set_pruning_parameters( pruning_beam, ob_pruning_beam );
593 
594   // temporary hack necessary because decoder can only take a
595   // function pointer (would be better to relax this restriction in
596   // the EST_Viterbi_Decoder class, or in a replacement class, rather
597   // than using this hack)
598   globalTempVoicePtr = this;
599 
600   v.set_big_is_good(false);
601 
602   if( verbosity() > 0 )
603     v.turn_on_trace();
604 
605   v.initialise( units );
606   v.search();
607 
608   // take hold of the best path (end thereof)
609   EST_VTPath *bestp=0;
610   if( !v.result( &bestp ) )
611     EST_error( "No best candidate sequence found" );
612 
613   // fill in the best path features in the Unit Relation
614   fillUnitRelation( units, bestp );
615 
616   my_parse_diphone_times( *units, *segs );
617 }
618 
619 
620 /////////////////////////////////////////////////////////////////////////////////////
621 // Canned example experimental code (proof of concept rather than intelligently done)
622 
itemListContainsItem(const ItemList * il,const EST_Item * item)623 static inline bool itemListContainsItem( const ItemList* il, const EST_Item *item )
624 {
625   ItemList::Entries it;
626 
627   for( it.begin( *il ); it; it++ )
628     if( (*it) == item )
629       return true;
630 
631   return false;
632 }
633 
634 
getCandidatesWithOmissionsFunction(EST_Item * s,EST_Features & f)635 static EST_VTCandidate* getCandidatesWithOmissionsFunction( EST_Item *s, EST_Features &f )
636 {
637   DiphoneUnitVoice *duv = globalTempVoicePtr;
638   if( duv==0 )
639     EST_error( "Candidate source voice is unset" );
640 
641   //get candidate list as usual
642   EST_VTCandidate *candlist = duv->getCandidates( s, f );
643 
644   //filter out candidates on basis of omission list (yes, this is quite dumb)
645   if( s->f_present( "omitlist" ) ){
646 
647     EST_warning( "omitlist found in unit %s", s->S("name").str() );
648 
649     ItemList *omitlist = itemlist( s->f("omitlist") );
650 
651     //until one candidate remains as head (to keep hold of list head)
652     while( candlist != 0 && itemListContainsItem( omitlist, candlist->s ) ){
653       EST_VTCandidate *del_cand = candlist;
654       candlist = candlist->next;
655       del_cand->next = 0; //so deletion doesn't trigger total list deletion
656       delete del_cand;
657     }
658 
659     //then continue down list
660     EST_VTCandidate *prev = candlist;
661     EST_VTCandidate *cand = candlist->next;
662     while( cand!=0 ){
663       if( itemListContainsItem( omitlist, cand->s ) ){ //delete cand on true
664 	prev->next = cand->next;
665 	cand->next = 0; //so deletion doesn't trigger total list deletion
666 	delete cand;
667 	cand = prev;
668       }
669       cand = cand->next;
670     }
671 
672     if( candlist == 0 )
673       EST_error( "zero candidates remain after filtering" );
674 
675   }
676 
677   return candlist;
678 }
679 
680 // For when the utterance already has the unit sequence, with certain candidates
681 // flagged as to be avoided, or mandatory and so on...
regetUnitSequence(EST_Utterance * utt)682 void DiphoneUnitVoice::regetUnitSequence( EST_Utterance *utt )
683 {
684   // Unit relation should already be in existence for decoder
685   EST_Relation *units = utt->relation( "Unit" );
686   EST_Item *it=units->head();
687   if( it == 0 )
688     EST_error( "Unit relation is empty" );
689 
690   // Make the decoder do its thing (again)
691   // -1 means number of states at each time point not fixed
692   EST_Viterbi_Decoder v( getCandidatesWithOmissionsFunction, extendPath, -1 );
693 
694   // turn on pruning if necessary
695   if( (pruning_beam>0) || (ob_pruning_beam>0) )
696     v.set_pruning_parameters( pruning_beam, ob_pruning_beam );
697 
698   // temporary hack necessary because decoder can only take a
699   // function pointer (would be better to relax this restriction in
700   // the EST_Viterbi_Decoder class, or in a replacement class, rather
701   // than using this hack)
702   globalTempVoicePtr = this;
703 
704   v.set_big_is_good(false);
705 
706   if( verbosity() > 0 )
707     v.turn_on_trace();
708 
709   v.initialise( units );
710   v.search();
711 
712   // take hold of the best path (end thereof)
713   EST_VTPath *bestp=0;
714   if( !v.result( &bestp ) )
715     EST_error( "No best candidate sequence found" );
716 
717   // fill in the best path features in the Unit Relation
718   fillUnitRelation( units, bestp );
719 
720   EST_Relation *segs = utt->relation("Segment");
721   my_parse_diphone_times( *units, *segs );
722 }
723 
724 // End canned example experimental code ///////////////////////////////////////////
725 ///////////////////////////////////////////////////////////////////////////////////
726 
727 
unitAvailable(const EST_String & diphone) const728 bool DiphoneUnitVoice::unitAvailable( const EST_String &diphone ) const
729 {
730   EST_TList<DiphoneVoiceModule*>::Entries it;
731 
732   for( it.begin(voiceModules); it; it++ )
733     if( (*it)->numAvailableCandidates(diphone) > 0 )
734       return true;
735 
736   return false;
737 }
738 
numAvailableCandidates(const EST_String & diphone) const739 unsigned int DiphoneUnitVoice::numAvailableCandidates( const EST_String &diphone ) const
740 {
741   unsigned int number = 0;
742   EST_TList<DiphoneVoiceModule*>::Entries it;
743 
744   for( it.begin(voiceModules); it; it++ )
745     number += (*it)->numAvailableCandidates(diphone);
746 
747   return number;
748 }
749 
750 
751 ////////////////////////////////////////////////////////////////////////
752 ////////////////////////////////////////////////////////////////////////
753 // special case of the above for utterances structures that are
754 // actually in the voice database, which doesn't do any search
755 // This is useful for doing copy synthesis of utterances (eg.
756 // to test out resynthesis, prosodic modification and so on)
getCopyUnitUtterance(const EST_String & utt_fname,EST_Utterance ** utt_out) const757 void DiphoneUnitVoice::getCopyUnitUtterance( const EST_String &utt_fname,
758 					     EST_Utterance **utt_out ) const
759 {
760   // need to find which, if any, voice module has this utterance
761   // in its list
762   EST_TList<DiphoneVoiceModule*>::Entries module_iter;
763   EST_Utterance *db_utt=0;
764   for( module_iter.begin(voiceModules); module_iter; module_iter++ )
765     if( (*module_iter)->getUtterance(&db_utt, "fileid", utt_fname) == true )
766       break;
767 
768   if( db_utt == 0 )
769     EST_error( "Could not find Utterance %s in any voice module",
770 	       utt_fname.str() );
771   else{
772     // deep copy database utterance and fill in Unit relation
773     *utt_out = new EST_Utterance( *db_utt );
774     CHECK_PTR(utt_out);
775 
776     EST_Utterance myUtt( *db_utt );
777 
778     cerr << myUtt.relation_present( "Segment" ) << " "
779 	 << myUtt.num_relations() <<endl;
780 
781 
782     cerr << db_utt->relation_present( "Segment" ) << " "
783 	 << (*utt_out)->relation_present( "Segment" ) << " "
784 	 << (*utt_out)->num_relations() <<endl;
785 
786 
787     EST_Relation *segs = (*utt_out)->relation( "Segment" );
788     EST_Relation *units = (*utt_out)->create_relation( "Unit" );
789 
790     // Initialise the Unit relation + fill in necessary/suitable
791     // synthesis parameters
792     EST_String ph1, ph2;
793     EST_Item *it = segs->tail();
794     EST_Item *db_utt_seg_it = db_utt->relation( "Segment" )->tail();
795     if( it == 0 )
796       EST_error( "Segment relation is empty" );
797     else{
798       ph2 = it->S("name");
799       while( ((it=it->prev())!=0) &&
800 	     ((db_utt_seg_it=db_utt_seg_it->prev())!=0) ){
801 	EST_Track *coefs = new EST_Track;
802 	CHECK_PTR(coefs);
803 	EST_Wave *sig = new EST_Wave;
804 	CHECK_PTR(sig);
805 	int midf;
806 
807 	(*module_iter)->getDiphone( db_utt_seg_it, coefs, sig, &midf );
808 
809 	ph1 = it->S("name");
810 	EST_Item *t = units->prepend();
811 	t->set( "name", EST_String::cat(ph1,"_",ph2) );
812 	t->set_val( "ph1", est_val(it) );
813 	t->set_val( "sig", est_val( sig ) );
814 	t->set_val( "coefs", est_val( coefs ) );
815 	t->set( "middle_frame", midf );
816 	t->set( "source_utt", db_utt->f.S("fileid"));
817 	t->set_val( "source_ph1", est_val( db_utt_seg_it ));
818 	t->set( "source_end", db_utt_seg_it->F("end"));
819 	t->set( "target_cost", 0.0 );
820 	t->set( "join_cost", 0.0);
821 
822 	ph2 = ph1;
823       }
824     }
825     my_parse_diphone_times( *units, *segs );
826 
827     // this is for copy synthesis, so copy actual timings
828     //for( EST_Item *seg = segs->head(); it!=0; it=it->next() )
829       //seg->set( "end", seg->F("source_end") );
830   }
831 }
832 
833 ////////////////////////////////////////////////////////////////////////
834 ////////////////////////////////////////////////////////////////////////
835 
836 
837 
numUnitTypes() const838 unsigned int DiphoneUnitVoice::numUnitTypes() const
839 {
840   //necessary?
841   return 0;
842 }
843 
numDatabaseUnits() const844 unsigned int DiphoneUnitVoice::numDatabaseUnits() const
845 {
846   unsigned int sum=0;
847 
848   EST_TList<DiphoneVoiceModule*>::Entries it;
849 
850   for( it.begin( voiceModules ); it; it++ )
851     sum += (*it)->numModuleUnits();
852 
853   return sum;
854 }
855 
856 
857 //////////////////////////////////////////////////////////////////////////
858 
set_diphone_backoff(DiphoneBackoff * dbo)859 void DiphoneUnitVoice::set_diphone_backoff(DiphoneBackoff *dbo)
860 {
861   if (diphone_backoff_rules)
862     delete diphone_backoff_rules;
863   diphone_backoff_rules = dbo;
864 }
865 
866 
getPhoneList(const EST_String & phone,ItemList & list)867 int DiphoneUnitVoice::getPhoneList( const EST_String &phone, ItemList &list )
868 {
869   unsigned int n=0;
870 
871   EST_TList<DiphoneVoiceModule*>::Entries it;
872   for( it.begin( voiceModules ); it; it++ )
873     n += (*it)->getPhoneList( phone, list );
874 
875   return n;
876 }
877 
878 
879 
precomputeJoinCosts(const EST_StrList & phones,bool verbose)880 void DiphoneUnitVoice::precomputeJoinCosts( const EST_StrList &phones, bool verbose  )
881 {
882   EST_StrList::Entries it;
883   for( it.begin( phones ); it; it++ ){
884     ItemList *l = new ItemList;
885     CHECK_PTR(l);
886 
887     unsigned int n = getPhoneList( (*it), *l );
888 
889     if( verbose==true )
890       cerr << "phone " << (*it) << "  "  << n << " instances\n";
891 
892     if( n>0 ){
893       jc->computeAndCache( *l, true ); //verbose=true
894     }
895     else
896       EST_warning( "Phone %s not listed in voice", (*it).str() );
897 
898     delete l;
899   }
900 }
901