1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* (University of Edinburgh, UK) and */
5 /* Korin Richmond */
6 /* Copyright (c) 2002 */
7 /* All Rights Reserved. */
8 /* */
9 /* Permission is hereby granted, free of charge, to use and distribute */
10 /* this software and its documentation without restriction, including */
11 /* without limitation the rights to use, copy, modify, merge, publish, */
12 /* distribute, sublicense, and/or sell copies of this work, and to */
13 /* permit persons to whom this work is furnished to do so, subject to */
14 /* the following conditions: */
15 /* */
16 /* 1. The code must retain the above copyright notice, this list of */
17 /* conditions and the following disclaimer. */
18 /* 2. Any modifications must be clearly marked as such. */
19 /* 3. Original authors' names are not deleted. */
20 /* 4. The authors' names are not used to endorse or promote products */
21 /* derived from this software without specific prior written */
22 /* permission. */
23 /* */
24 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
25 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
26 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT */
27 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
28 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
29 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
30 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
31 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
32 /* THIS SOFTWARE. */
33 /* */
34 /*************************************************************************/
35 /* */
36 /* Author: Korin Richmond */
37 /* Date: Aug 2002 */
38 /* --------------------------------------------------------------------- */
39 /* first stab at a diphone unit selection "voice" - using a list of */
40 /* utterance objects */
41 /*************************************************************************/
42
43
44 #ifndef __DIPHONEUNITVOICE_H__
45 #define __DIPHONEUNITVOICE_H__
46
47 #include "VoiceBase.h"
48 #include "DiphoneBackoff.h"
49 #include "siod_defs.h"
50 #include "EST_Val_defs.h"
51 #include "EST_String.h"
52 #include "EST_FlatTargetCost.h"
53
54 #include "EST_types.h" // for EST_StrList
55
56 class EST_Utterance;
57 class EST_Relation;
58 class EST_VTCandidate;
59 class EST_VTPath;
60 class EST_Features;
61 class EST_Track;
62 class EST_Wave;
63 class EST_Item;
64 class DiphoneVoiceModule;
65 class EST_JoinCost;
66
67
68 #include "EST_THash.h"
69 template<class T> class EST_TList;
70 typedef EST_TList<EST_Item*> ItemList;
71
SIOD_REGISTER_TYPE_DCLS(itemlist,ItemList)72 SIOD_REGISTER_TYPE_DCLS(itemlist, ItemList)
73 VAL_REGISTER_TYPE_DCLS(itemlist, ItemList)
74
75 SIOD_REGISTER_CLASS_DCLS(du_voice,DiphoneUnitVoice)
76 VAL_REGISTER_CLASS_DCLS(du_voice,DiphoneUnitVoice)
77
78
79 class DiphoneUnitVoice : public VoiceBase {
80 public:
81 DiphoneUnitVoice( const EST_StrList& basenames,
82 const EST_String& uttDir,
83 const EST_String& wavDir,
84 const EST_String& pmDir,
85 const EST_String& coefDir,
86 unsigned int srate = 16000,
87 const EST_String& uttExt = ".utt",
88 const EST_String& wavExt = ".wav",
89 const EST_String& pmExt = ".pm",
90 const EST_String& coefExt = ".coef" );
91
92 virtual ~DiphoneUnitVoice();
93
94 virtual void initialise( bool ignore_bad_tag=false );
95 virtual unsigned int numDatabaseUnits() const;
96 virtual unsigned int numUnitTypes() const;
97
98 virtual bool synthesiseWave( EST_Utterance *utt );
99
100 virtual void getUnitSequence( EST_Utterance *utt );
101
102 void regetUnitSequence( EST_Utterance *utt );
103
104 void getCopyUnitUtterance( const EST_String &utt_fname,
105 EST_Utterance **utt_out ) const;
106
107 EST_VTCandidate* getCandidates( EST_Item *s, EST_Features &f ) const;
108 void diphoneCoverage(const EST_String filename) const;
109
110 virtual bool unitAvailable( const EST_String &diphone ) const;
111 virtual unsigned int numAvailableCandidates( const EST_String &unit ) const;
112
113 unsigned int numModules() const { return voiceModules.length(); }
114
115 bool addVoiceModule( const EST_StrList& basenames,
116 const EST_String& uttDir,
117 const EST_String& wavDir,
118 const EST_String& pmDir,
119 const EST_String& coefDir,
120 unsigned int srate = 16000,
121 const EST_String& uttExt = ".utt",
122 const EST_String& wavExt = ".wav",
123 const EST_String& pmExt = ".pm",
124 const EST_String& coefExt = ".coef" );
125
126
127 // assume responsibility to delete vm when done with it
128 void registerVoiceModule( DiphoneVoiceModule *vm );
129
130 // del=true means it's ok to delete the join cost when we're done
131 // with it
132 void setJoinCost( EST_JoinCost *jcost, bool del=false );
133 const EST_JoinCost& getJoinCostCalculator( ) const { return *jc; }
134
135 void setTargetCost( EST_TargetCost *tcost, bool del=false );
136 const EST_TargetCost& getTargetCostCalculator( ) const { return *tc; }
137
138 void set_pruning_beam( float width ) { pruning_beam=width; }
139 float get_pruning_beam( ) const { return pruning_beam; }
140
141 void set_ob_pruning_beam( float width ){ ob_pruning_beam=width; }
142 float get_ob_pruning_beam( ) const { return ob_pruning_beam; }
143
144 void set_jc_f0_weight( float val ) { jc_f0_weight=val; }
145 float get_jc_f0_weight() { return jc_f0_weight; }
146 EST_JoinCost * get_jc() { return jc; }
147
148 void set_jc_power_weight( float val ) { jc_power_weight=val; }
149 float get_jc_power_weight() { return jc_power_weight; }
150
151 void set_jc_spectral_weight( float val ) { jc_spectral_weight=val; }
152 float get_jc_spectral_weight() { return jc_spectral_weight; }
153
154 void set_tc_rescoring_beam( float width ){ tc_rescoring_beam = width; }
155 float get_tc_rescoring_beam( ) const { return tc_rescoring_beam; }
156
157 void set_tc_rescoring_weight( float weight ){ tc_rescoring_weight = weight; }
158 float get_tc_rescoring_weight( ) const { return tc_rescoring_weight; }
159
160 void set_target_cost_weight( float w ){ tc_weight=w; }
161 float get_target_cost_weight() const { return tc_weight; }
162
163 void set_join_cost_weight( float w ){ jc_weight=w; }
164 float get_join_cost_weight() const { return jc_weight; }
165
166 void set_prosodic_modification( int m ){ prosodic_modification=m; }
167 int get_prosodic_modification() const { return prosodic_modification; }
168
169 void set_wav_samplerate( unsigned int sr ) { wav_srate = sr; }
170 unsigned int get_wav_samplerate( ) const { return wav_srate; }
171
172 void precomputeJoinCosts( const EST_StrList &phones, bool verbose=true );
173
174 private:
175 // don't allow copying of Voices (for now?)
176 DiphoneUnitVoice( const DiphoneUnitVoice& );
177 DiphoneUnitVoice& operator=( const DiphoneUnitVoice& );
178
179 void addToCatalogue( const EST_Utterance *utt );
180
181 void getDiphone( const EST_VTCandidate *cand,
182 EST_Track* coef, EST_Wave* sig, int *midframe,
183 bool extendLeft=0, bool extendRight=0 );
184
185 int getPhoneList( const EST_String &phone, ItemList &list );
186
187 void fillUnitRelation( EST_Relation *units, const EST_VTPath *path );
188
189 private:
190 EST_TList<DiphoneVoiceModule*> voiceModules;
191 float pruning_beam; // beam pruning
192 float ob_pruning_beam; // observation beam pruning
193
194 float tc_rescoring_beam;
195 float tc_rescoring_weight;
196
197 float tc_weight;
198 float jc_weight;
199
200 float jc_f0_weight; // join cost f0 weight
201 float jc_power_weight; // join cost f0 weight
202 float jc_spectral_weight; // join cost spectral weight
203
204 int prosodic_modification;
205
206 unsigned int wav_srate;
207
208 EST_JoinCost *jc;
209 bool jc_delete;
210
211 EST_TargetCost *tc;
212 bool tc_delete;
213
214 TCDataHash *tcdh;
215
216 private:
217 DiphoneBackoff *diphone_backoff_rules; // diphone backoff rules
218
219 public:
220 void set_diphone_backoff(DiphoneBackoff *dbo);
221
222 };
223
224
225 #endif // __DIPHONEUNITVOICE_H__
226
227