1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                 (University of Edinburgh, UK) and                     */
5 /*                           Korin Richmond                              */
6 /*                         Copyright (c) 2002                            */
7 /*                         All Rights Reserved.                          */
8 /*                                                                       */
9 /*  Permission is hereby granted, free of charge, to use and distribute  */
10 /*  this software and its documentation without restriction, including   */
11 /*  without limitation the rights to use, copy, modify, merge, publish,  */
12 /*  distribute, sublicense, and/or sell copies of this work, and to      */
13 /*  permit persons to whom this work is furnished to do so, subject to   */
14 /*  the following conditions:                                            */
15 /*                                                                       */
16 /*   1. The code must retain the above copyright notice, this list of    */
17 /*      conditions and the following disclaimer.                         */
18 /*   2. Any modifications must be clearly marked as such.                */
19 /*   3. Original authors' names are not deleted.                         */
20 /*   4. The authors' names are not used to endorse or promote products   */
21 /*      derived from this software without specific prior written        */
22 /*      permission.                                                      */
23 /*                                                                       */
24 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
25 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
26 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT   */
27 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
28 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
29 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
30 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
31 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
32 /*  THIS SOFTWARE.                                                       */
33 /*                                                                       */
34 /*************************************************************************/
35 /*                                                                       */
36 /*                          Author: Korin Richmond                       */
37 /*                            Date:  Aug  2002                           */
38 /* --------------------------------------------------------------------- */
39 /* first stab at a diphone unit selection "voice" - using a list of      */
40 /* utterance objects                                                     */
41 /*************************************************************************/
42 
43 
44 #ifndef __DIPHONEUNITVOICE_H__
45 #define __DIPHONEUNITVOICE_H__
46 
47 #include "VoiceBase.h"
48 #include "DiphoneBackoff.h"
49 #include "siod_defs.h"
50 #include "EST_Val_defs.h"
51 #include "EST_String.h"
52 #include "EST_FlatTargetCost.h"
53 
54 #include "EST_types.h" // for EST_StrList
55 
// Forward declarations.  Within this header these types are only named
// through pointers, references or as template arguments, so their full
// definitions are not needed here.
class EST_Utterance;
class EST_Relation;
class EST_VTCandidate;
class EST_VTPath;
class EST_Features;
class EST_Track;
class EST_Wave;
class EST_Item;
class DiphoneVoiceModule;
class EST_JoinCost;
66 
67 
68 #include "EST_THash.h"
69 template<class T> class EST_TList;
70 typedef EST_TList<EST_Item*> ItemList;
71 
// Registration declarations for ItemList under the name "itemlist".
// NOTE(review): the macros come from siod_defs.h / EST_Val_defs.h; their
// expansions are not visible in this header.
SIOD_REGISTER_TYPE_DCLS(itemlist, ItemList)
VAL_REGISTER_TYPE_DCLS(itemlist, ItemList)

// Registration declarations for DiphoneUnitVoice under the name "du_voice".
SIOD_REGISTER_CLASS_DCLS(du_voice,DiphoneUnitVoice)
VAL_REGISTER_CLASS_DCLS(du_voice,DiphoneUnitVoice)
77 
78 
79 class DiphoneUnitVoice : public VoiceBase {
80 public:
81   DiphoneUnitVoice( const EST_StrList& basenames,
82 		    const EST_String& uttDir,
83 		    const EST_String& wavDir,
84 		    const EST_String& pmDir,
85 		    const EST_String& coefDir,
86 		    unsigned int srate = 16000,
87 		    const EST_String& uttExt  = ".utt",
88 		    const EST_String& wavExt  = ".wav",
89 		    const EST_String& pmExt   = ".pm",
90 		    const EST_String& coefExt = ".coef" );
91 
92   virtual ~DiphoneUnitVoice();
93 
94   virtual void initialise( bool ignore_bad_tag=false );
95   virtual unsigned int numDatabaseUnits() const;
96   virtual unsigned int numUnitTypes() const;
97 
98   virtual bool synthesiseWave( EST_Utterance *utt );
99 
100   virtual void getUnitSequence( EST_Utterance *utt );
101 
102   void regetUnitSequence( EST_Utterance *utt );
103 
104   void getCopyUnitUtterance( const EST_String &utt_fname,
105 			     EST_Utterance **utt_out ) const;
106 
107   EST_VTCandidate* getCandidates( EST_Item *s, EST_Features &f ) const;
108   void diphoneCoverage(const EST_String filename) const;
109 
110   virtual bool unitAvailable( const EST_String &diphone ) const;
111   virtual unsigned int numAvailableCandidates( const EST_String &unit ) const;
112 
113   unsigned int numModules() const { return voiceModules.length(); }
114 
115   bool addVoiceModule( const EST_StrList& basenames,
116 		       const EST_String& uttDir,
117 		       const EST_String& wavDir,
118 		       const EST_String& pmDir,
119 		       const EST_String& coefDir,
120 		       unsigned int srate = 16000,
121 		       const EST_String& uttExt  = ".utt",
122 		       const EST_String& wavExt  = ".wav",
123 		       const EST_String& pmExt   = ".pm",
124 		       const EST_String& coefExt = ".coef" );
125 
126 
127   // assume responsibility to delete vm when done with it
128   void registerVoiceModule( DiphoneVoiceModule *vm );
129 
130   // del=true means it's ok to delete the join cost when we're done
131   // with it
132   void setJoinCost( EST_JoinCost *jcost, bool del=false );
133   const EST_JoinCost& getJoinCostCalculator( ) const { return *jc; }
134 
135   void setTargetCost( EST_TargetCost *tcost, bool del=false );
136   const EST_TargetCost& getTargetCostCalculator( ) const { return *tc; }
137 
138   void  set_pruning_beam( float width ) { pruning_beam=width; }
139   float get_pruning_beam( ) const { return pruning_beam; }
140 
141   void  set_ob_pruning_beam( float width ){ ob_pruning_beam=width; }
142   float get_ob_pruning_beam( ) const { return ob_pruning_beam; }
143 
144   void  set_jc_f0_weight( float val ) { jc_f0_weight=val; }
145   float get_jc_f0_weight() { return jc_f0_weight; }
146   EST_JoinCost * get_jc() { return jc; }
147 
148   void  set_jc_power_weight( float val ) { jc_power_weight=val; }
149   float get_jc_power_weight() { return jc_power_weight; }
150 
151   void  set_jc_spectral_weight( float val ) { jc_spectral_weight=val; }
152   float get_jc_spectral_weight() { return jc_spectral_weight; }
153 
154   void  set_tc_rescoring_beam( float width ){ tc_rescoring_beam = width; }
155   float get_tc_rescoring_beam( ) const { return tc_rescoring_beam; }
156 
157   void  set_tc_rescoring_weight( float weight ){ tc_rescoring_weight = weight; }
158   float get_tc_rescoring_weight( ) const { return tc_rescoring_weight; }
159 
160   void  set_target_cost_weight( float w ){ tc_weight=w; }
161   float get_target_cost_weight() const { return tc_weight; }
162 
163   void  set_join_cost_weight( float w ){ jc_weight=w; }
164   float get_join_cost_weight() const { return jc_weight; }
165 
166   void  set_prosodic_modification( int m ){ prosodic_modification=m; }
167   int get_prosodic_modification() const { return prosodic_modification; }
168 
169   void set_wav_samplerate( unsigned int sr ) { wav_srate = sr; }
170   unsigned int get_wav_samplerate( ) const { return wav_srate; }
171 
172   void precomputeJoinCosts( const EST_StrList &phones, bool verbose=true );
173 
174 private:
175   // don't allow copying of Voices (for now?)
176   DiphoneUnitVoice( const DiphoneUnitVoice& );
177   DiphoneUnitVoice& operator=( const DiphoneUnitVoice& );
178 
179   void addToCatalogue( const EST_Utterance *utt );
180 
181   void getDiphone( const EST_VTCandidate *cand,
182 		   EST_Track* coef, EST_Wave* sig, int *midframe,
183 		   bool extendLeft=0, bool extendRight=0 );
184 
185   int getPhoneList( const EST_String &phone, ItemList &list );
186 
187   void fillUnitRelation( EST_Relation *units, const EST_VTPath *path );
188 
189 private:
190   EST_TList<DiphoneVoiceModule*> voiceModules;
191   float pruning_beam;     // beam pruning
192   float ob_pruning_beam;  // observation beam pruning
193 
194   float tc_rescoring_beam;
195   float tc_rescoring_weight;
196 
197   float tc_weight;
198   float jc_weight;
199 
200   float jc_f0_weight;    // join cost f0 weight
201   float jc_power_weight;    // join cost f0 weight
202   float jc_spectral_weight; // join cost spectral weight
203 
204   int prosodic_modification;
205 
206   unsigned int wav_srate;
207 
208   EST_JoinCost *jc;
209   bool jc_delete;
210 
211   EST_TargetCost *tc;
212   bool tc_delete;
213 
214   TCDataHash *tcdh;
215 
216 private:
217   DiphoneBackoff *diphone_backoff_rules;   // diphone backoff rules
218 
219 public:
220   void set_diphone_backoff(DiphoneBackoff *dbo);
221 
222 };
223 
224 
225 #endif // __DIPHONEUNITVOICE_H__
226 
227