1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Tree-based prediction of intonation. Uses accent and end */
38 /* tone prediction trees, could be ToBI could be something */
/* else, it's up to the trees to decide ... */
40 /* */
41 /* Accents and boundaries are predicted by CART tree while */
42 /* the F0 targets are predicted by linear regression (as */
43 /* described in Black and Hunt ICSLP96) */
44 /* */
45 /*=======================================================================*/
46 #include <cstdio>
47 #include "festival.h"
48 #include "intonation.h"
49
// Position, relative to a segment, at which an F0 target is placed.
// add_target_at() turns each value into a time within the segment.
enum lr_tpos {tp_start, tp_left, tp_mid, tp_right, tp_end};

// File-local helpers, defined further down in this file.
static EST_String accent_specified(EST_Item *s);
static EST_String tone_specified(EST_Item *s);
static int after_pause(EST_Item *s);
static int before_pause(EST_Item *s);
static EST_Item *vowel_seg(EST_Item *syl);
static void init_int_lr_params(void);
static void add_target_at(EST_Utterance *u, EST_Item *seg,
float val,lr_tpos pos);
static float apply_lr_model(LISP model, EST_FVector &feats);
static void find_feat_values(EST_Item *s, LISP model,EST_FVector &feats);

static LISP Intonation_Endtone_Tree_Utt(LISP utt); // ... mh 99-08-06
static LISP Intonation_Accent_Tree_Utt(LISP utt);

// Mean / standard deviation pairs used to rescale predicted F0 values.
// The defaults below make the mapping the identity; init_int_lr_params()
// overwrites them from the Lisp variable int_lr_params.
static float target_f0_mean = 0.0;
static float target_f0_std = 1.0;
static float model_f0_mean = 0.0;
static float model_f0_std = 1.0;

// MZSCORE:   z-score X against the model mean/std.
// UNTZSCORE: scale a z-score by the target std and add the target mean.
// MAP_F0:    the two combined, i.e. model range -> target range.
#define MZSCORE(X) (((X)-model_f0_mean)/model_f0_std)
#define UNTZSCORE(X) (((X)*target_f0_std)+target_f0_mean)
#define MAP_F0(X) (UNTZSCORE(MZSCORE(X)))
74
FT_Intonation_Tree_Utt(LISP utt)75 LISP FT_Intonation_Tree_Utt(LISP utt)
76 {
77 // For each syllable predict intonation events. Potentially
78 // two forms, accents and ent tones
79 EST_Utterance *u = get_c_utt(utt);
80
81 u->create_relation("IntEvent");
82 u->create_relation("Intonation");
83
84 utt = Intonation_Endtone_Tree_Utt(utt);
85 utt = Intonation_Accent_Tree_Utt(utt);
86
87 return utt;
88 }
89
Intonation_Accent_Tree_Utt(LISP utt)90 LISP Intonation_Accent_Tree_Utt(LISP utt)
91 {
92 // For each syllable predict intonation events.
93 // here only accents
94 EST_Utterance *u = get_c_utt(utt);
95 EST_Item *s;
96 EST_String paccent;
97 LISP accent_tree;
98
99 accent_tree = siod_get_lval("int_accent_cart_tree","no accent tree");
100
101 for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
102 {
103 if ((paccent = accent_specified(s)) == "0") // check if pre-specified
104 paccent = (EST_String)wagon_predict(s,accent_tree);
105 if (paccent != "NONE")
106 add_IntEvent(u,s,paccent);
107 }
108 return utt;
109 }
110
Intonation_Endtone_Tree_Utt(LISP utt)111 LISP Intonation_Endtone_Tree_Utt(LISP utt)
112 {
113 // For each syllable predict intonation events.
114 // here only endtones
115 EST_Utterance *u = get_c_utt(utt);
116 EST_Item *s;
117 EST_String ptone;
118 LISP endtone_tree;
119
120 endtone_tree = siod_get_lval("int_tone_cart_tree","no tone cart tree");
121
122 for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
123 {
124 if ((ptone = tone_specified(s)) == "0")
125 ptone = (EST_String)wagon_predict(s,endtone_tree);
126 if (ptone != "NONE")
127 add_IntEvent(u,s,ptone);
128 }
129 return utt;
130 }
131
accent_specified(EST_Item * s)132 static EST_String accent_specified(EST_Item *s)
133 {
134 // If there is an explicit accent specifed on the related token
135 // If there is check the syllable to see if its stress or a singleton
136 EST_Item *word = parent(s,"SylStructure");
137 if (!word) return "0";
138 EST_Item *token = parent(word,"Token");
139 EST_String paccent("0");
140 if (token)
141 paccent = (EST_String)ffeature(token,"accent");
142
143 if (paccent == "0")
144 {
145 paccent = (EST_String)ffeature(word,"accent");
146 if (paccent == "0")
147 return paccent;
148 }
149 if (ffeature(s,"stress") == "1")
150 { // only goes on first stressed syllable
151 EST_Item *p;
152 for (p=as(s,"SylStructure")->prev(); p != 0; p=p->prev())
153 if (ffeature(s,"stress") == "1")
154 return "NONE"; // specified but not on this syllable
155 return paccent; // first stressed syl in word
156 }
157 else if (daughter1(word)->length() == 1)
158 return paccent;
159 else
160 return "NONE"; // pre-specified but inappropriate syllable in word
161 }
162
tone_specified(EST_Item * s)163 static EST_String tone_specified(EST_Item *s)
164 {
165 // If there is an explicit accent specifed on the related token
166 // If there is check the syllable to see if its strees or a singleton
167 EST_Item *ss = s->as_relation("SylStructure");
168 EST_Item *word = parent(ss);
169 if (!word) return "0";
170 EST_Item *token = parent(word,"Token");
171 EST_String ptone("0");
172 if (token)
173 ptone = (EST_String)ffeature(token,"tone");
174
175 if (ptone == "0")
176 {
177 ptone = (EST_String)ffeature(word,"tone");
178 if (ptone == "0")
179 return ptone;
180 }
181 if (ss->next() == 0) // final syllable in word
182 return ptone;
183 else
184 return "NONE"; // pre-specified but inappropriate syllable in word
185 }
186
FT_Int_Targets_LR_Utt(LISP utt)187 LISP FT_Int_Targets_LR_Utt(LISP utt)
188 {
189 // Predict F0 targets using Linear regression
190 EST_Utterance *u = get_c_utt(utt);
191 EST_Item *s;
192 float pstart, pmid, pend;
193 LISP start_lr, mid_lr, end_lr;
194
195 init_int_lr_params();
196 // Note the models must *all* be the same size
197 start_lr = siod_get_lval("f0_lr_start","no f0 start lr model");
198 mid_lr = siod_get_lval("f0_lr_mid","no f0 mid lr model");
199 end_lr = siod_get_lval("f0_lr_end","no f0 end lr model");
200
201 u->create_relation("Target");
202 pend = 0;
203 EST_FVector feats;
204 feats.resize(siod_llength(start_lr));
205
206 for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
207 {
208 find_feat_values(s,start_lr,feats);
209 pstart = apply_lr_model(start_lr,feats);
210 pstart = MAP_F0(pstart);
211 if (after_pause(s))
212 add_target_at(u,daughter1(s,"SylStructure"),pstart,tp_start);
213 else
214 add_target_at(u,daughter1(s,"SylStructure"),
215 (pstart+pend)/2.0,tp_start);
216
217 pmid = apply_lr_model(mid_lr,feats);
218 pmid = MAP_F0(pmid);
219 add_target_at(u,vowel_seg(s),pmid,tp_mid);
220
221 pend = apply_lr_model(end_lr,feats);
222 pend = MAP_F0(pend);
223 if (before_pause(s))
224 add_target_at(u,daughtern(s,"SylStructure"),pend,tp_end);
225 }
226
227 return utt;
228
229 }
230
FT_Int_Targets_LR_5_Utt(LISP utt)231 LISP FT_Int_Targets_LR_5_Utt(LISP utt)
232 {
233 // Predict F0 targets using Linear regression
234 // This version uses 5 points rather than 3.
235 EST_Utterance *u = get_c_utt(utt);
236 EST_Item *s;
237 float pstart, pleft, pmid, pright, pend;
238 LISP start_lr, left_lr, mid_lr, right_lr, end_lr;
239
240 init_int_lr_params();
241 // Note the models must *all* be the same size
242 start_lr = siod_get_lval("f0_lr_start","no f0 start lr model");
243 left_lr = siod_get_lval("f0_lr_left","no f0 left lr model");
244 mid_lr = siod_get_lval("f0_lr_mid","no f0 mid lr model");
245 right_lr = siod_get_lval("f0_lr_right","no f0 right lr model");
246 end_lr = siod_get_lval("f0_lr_end","no f0 end lr model");
247
248 u->create_relation("Target");
249 pend = 0;
250 EST_FVector feats;
251 feats.resize(siod_llength(start_lr));
252
253 for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
254 {
255 find_feat_values(s,start_lr,feats);
256 pstart = apply_lr_model(start_lr,feats);
257 pstart = MAP_F0(pstart);
258 if (after_pause(s))
259 add_target_at(u,daughter1(s,"SylStructure"),pstart,tp_start);
260 else
261 add_target_at(u,daughter1(s,"SylStructure"),
262 (pstart+pend)/2.0,tp_start);
263
264 pleft = apply_lr_model(left_lr,feats);
265 pleft = MAP_F0(pleft);
266 add_target_at(u,vowel_seg(s),pleft,tp_left);
267 pmid = apply_lr_model(mid_lr,feats);
268 pmid = MAP_F0(pmid);
269 add_target_at(u,vowel_seg(s),pmid,tp_mid);
270 pright = apply_lr_model(right_lr,feats);
271 pright = MAP_F0(pright);
272 add_target_at(u,vowel_seg(s),pright,tp_right);
273
274 pend = apply_lr_model(end_lr,feats);
275 pend = MAP_F0(pend);
276 if (before_pause(s))
277 add_target_at(u,daughtern(s,"SylStructure"),pend,tp_end);
278 }
279
280 return utt;
281
282 }
283
284
// Accessors for one term X of an LR model.  A term is a list whose
// first element is the feature name, second the weight and (optionally)
// third a map class; find_feat_values() treats a term of length 3 as
// having a map class.
#define FFEATURE_NAME(X) (get_c_string(car(X)))
#define FFEATURE_WEIGHT(X) (get_c_float(car(cdr(X))))
#define FFEATURE_MAPCLASS(X) (car(cdr(cdr(X))))
288
find_feat_values(EST_Item * s,LISP model,EST_FVector & feats)289 static void find_feat_values(EST_Item *s, LISP model,EST_FVector &feats)
290 {
291 EST_Val v = 0.0;
292 int i;
293 LISP f;
294 const char *ffeature_name, *last_name="";
295
296 feats[0] = 1;
297 for (i=1,f=cdr(model); CONSP(f); f=CDR(f),i++)
298 {
299 ffeature_name = FFEATURE_NAME(CAR(f));
300 if (!streq(ffeature_name,last_name))
301 v = ffeature(s,ffeature_name);
302 if (siod_llength(CAR(f)) == 3)
303 { // A map class is specified
304 if (siod_member_str(v.string(),FFEATURE_MAPCLASS(CAR(f))) != NIL)
305 feats[i] = 1;
306 else
307 feats[i] = 0;
308 }
309 else
310 feats[i] = (float)v;
311 last_name = ffeature_name;
312 }
313 }
314
apply_lr_model(LISP model,EST_FVector & feats)315 static float apply_lr_model(LISP model, EST_FVector &feats)
316 {
317 float answer = FFEATURE_WEIGHT(car(model));
318 int i;
319 LISP f;
320
321 for(i=1,f=cdr(model); i<feats.n(); f=cdr(f),i++)
322 answer += feats.a_no_check(i) * FFEATURE_WEIGHT(CAR(f));
323
324 return answer;
325 }
326
init_int_lr_params(void)327 static void init_int_lr_params(void)
328 {
329 LISP params;
330
331 params = siod_get_lval("int_lr_params","no lr params");
332
333 target_f0_mean = get_param_float("target_f0_mean",params,0.0);
334 target_f0_std = get_param_float("target_f0_std",params,1.0);
335 model_f0_mean = get_param_float("model_f0_mean",params,0.0);
336 model_f0_std = get_param_float("model_f0_std",params,1.0);
337 }
338
339
add_target_at(EST_Utterance * u,EST_Item * seg,float val,lr_tpos pos)340 static void add_target_at(EST_Utterance *u, EST_Item *seg,
341 float val,lr_tpos pos)
342 {
343 // Add a target to segment at position
344
345 if (seg == 0)
346 {
347 cerr << "Int_Tree: failed to find seg related to syllable for target."
348 << endl;
349 return;
350 }
351
352 if (pos == tp_start)
353 add_target(u,seg,ffeature(seg,"segment_start").Float(),val);
354 else if (pos == tp_left)
355 add_target(u,seg,
356 0.5*(ffeature(seg,"segment_mid").Float()+
357 ffeature(seg,"segment_start").Float()),
358 val);
359 else if (pos == tp_mid)
360 add_target(u,seg,ffeature(seg,"segment_mid").Float(),val);
361 else if (pos == tp_right)
362 add_target(u,seg,
363 0.5*(ffeature(seg,"segment_mid").Float()+
364 seg->F("end")),
365 val);
366 else if (pos == tp_end)
367 add_target(u,seg,seg->F("end"),val);
368 else
369 {
370 cerr << "add_target_at: unknown position type\n";
371 festival_error();
372 }
373 }
374
after_pause(EST_Item * s)375 static int after_pause(EST_Item *s)
376 {
377 // TRUE if segment immediately previous to this is a silence
378 EST_Item *p;
379 if (s->prev() == 0)
380 return TRUE;
381 EST_Item *ss = s->as_relation("SylStructure");
382 if (s->prev() == ss->prev())
383 return FALSE;
384
385 p = daughter1(ss)->as_relation("Segment")->prev();
386 if (p == 0)
387 return TRUE;
388 else if (ph_is_silence(p->name()))
389 return TRUE;
390 else
391 return FALSE;
392 }
393
before_pause(EST_Item * s)394 static int before_pause(EST_Item *s)
395 {
396 // TRUE is segment immediately after this is a silence
397 if (s->next() == 0)
398 return TRUE;
399 EST_Item *ss = s->as_relation("SylStructure");
400 EST_Item *n = daughtern(ss)->as_relation("Segment")->next();
401 if (ph_is_silence(n->name()))
402 return TRUE;
403 else
404 return FALSE;
405 }
406
vowel_seg(EST_Item * syl)407 static EST_Item *vowel_seg(EST_Item *syl)
408 {
409 // return related to vowel segment
410 EST_Item *p;
411
412 for (p=daughter1(syl,"SylStructure"); p != 0; p=p->next())
413 if (ph_is_vowel(p->name()))
414 return p;
415
416 // No vowel found, so return first daughter.
417 return daughter1(syl,"SylStructure");
418 }
419
420
421