1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                       Copyright (c) 1996,1997                         */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*             Author :  Alan W Black                                    */
34 /*             Date   :  May 1996                                        */
35 /*-----------------------------------------------------------------------*/
36 /*                                                                       */
37 /* Tree-based prediction of intonation.  Uses accent and end             */
38 /* tone prediction trees, could be ToBI could be something               */
/* else, it's up to the trees to decide ...                              */
40 /*                                                                       */
41 /* Accents and boundaries are predicted by CART tree while               */
42 /* the F0 targets are predicted by linear regression (as                 */
43 /* described in Black and Hunt ICSLP96)                                  */
44 /*                                                                       */
45 /*=======================================================================*/
46 #include <cstdio>
47 #include "festival.h"
48 #include "intonation.h"
49 
// Position within a segment at which add_target_at() places an F0 target:
// segment start, half way between start and mid, mid, half way between
// mid and end, and segment end.
enum lr_tpos {tp_start, tp_left, tp_mid, tp_right, tp_end};

// File-local helpers, defined further down in this file.
static EST_String accent_specified(EST_Item *s);
static EST_String tone_specified(EST_Item *s);
static int after_pause(EST_Item *s);
static int before_pause(EST_Item *s);
static EST_Item *vowel_seg(EST_Item *syl);
static void init_int_lr_params(void);
static void add_target_at(EST_Utterance *u, EST_Item *seg,
			  float val,lr_tpos pos);
static float apply_lr_model(LISP model, EST_FVector &feats);
static void find_feat_values(EST_Item *s, LISP model,EST_FVector &feats);

// The two halves of FT_Intonation_Tree_Utt: end tones and accents.
static LISP Intonation_Endtone_Tree_Utt(LISP utt);  // ... mh 99-08-06
static LISP Intonation_Accent_Tree_Utt(LISP utt);

// F0 normalisation parameters.  These defaults make MAP_F0 the identity;
// init_int_lr_params() overwrites them from the "int_lr_params" variable.
static float target_f0_mean = 0.0;
static float target_f0_std = 1.0;
static float model_f0_mean = 0.0;
static float model_f0_std = 1.0;

// MZSCORE:   convert X to a z-score using the model mean and std.
// UNTZSCORE: convert a z-score X back using the target mean and std.
// MAP_F0:    map a value predicted in the model's range into the target range.
#define MZSCORE(X) (((X)-model_f0_mean)/model_f0_std)
#define UNTZSCORE(X) (((X)*target_f0_std)+target_f0_mean)
#define MAP_F0(X) (UNTZSCORE(MZSCORE(X)))
74 
FT_Intonation_Tree_Utt(LISP utt)75 LISP FT_Intonation_Tree_Utt(LISP utt)
76 {
77     // For each syllable predict intonation events.  Potentially
78     // two forms, accents and ent tones
79     EST_Utterance *u = get_c_utt(utt);
80 
81     u->create_relation("IntEvent");
82     u->create_relation("Intonation");
83 
84     utt = Intonation_Endtone_Tree_Utt(utt);
85     utt = Intonation_Accent_Tree_Utt(utt);
86 
87     return utt;
88 }
89 
Intonation_Accent_Tree_Utt(LISP utt)90 LISP Intonation_Accent_Tree_Utt(LISP utt)
91 {
92     // For each syllable predict intonation events.
93     // here only accents
94     EST_Utterance *u = get_c_utt(utt);
95     EST_Item *s;
96     EST_String paccent;
97     LISP accent_tree;
98 
99     accent_tree = siod_get_lval("int_accent_cart_tree","no accent tree");
100 
101     for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
102     {
103 	if ((paccent = accent_specified(s)) == "0") // check if pre-specified
104 	    paccent = (EST_String)wagon_predict(s,accent_tree);
105 	if (paccent != "NONE")
106 	    add_IntEvent(u,s,paccent);
107     }
108     return utt;
109 }
110 
Intonation_Endtone_Tree_Utt(LISP utt)111 LISP Intonation_Endtone_Tree_Utt(LISP utt)
112 {
113     // For each syllable predict intonation events.
114     // here only endtones
115     EST_Utterance *u = get_c_utt(utt);
116     EST_Item *s;
117     EST_String ptone;
118     LISP endtone_tree;
119 
120     endtone_tree = siod_get_lval("int_tone_cart_tree","no tone cart tree");
121 
122     for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
123     {
124 	if ((ptone = tone_specified(s)) == "0")
125 	    ptone = (EST_String)wagon_predict(s,endtone_tree);
126 	if (ptone != "NONE")
127 	    add_IntEvent(u,s,ptone);
128     }
129     return utt;
130 }
131 
accent_specified(EST_Item * s)132 static EST_String accent_specified(EST_Item *s)
133 {
134     // If there is an explicit accent specifed on the related token
135     // If there is check the syllable to see if its stress or a singleton
136     EST_Item *word = parent(s,"SylStructure");
137     if (!word) return "0";
138     EST_Item *token = parent(word,"Token");
139     EST_String paccent("0");
140     if (token)
141 	paccent = (EST_String)ffeature(token,"accent");
142 
143     if (paccent == "0")
144     {
145 	paccent = (EST_String)ffeature(word,"accent");
146 	if (paccent == "0")
147 	    return paccent;
148     }
149     if (ffeature(s,"stress") == "1")
150     {   // only goes on first stressed syllable
151 	EST_Item *p;
152 	for (p=as(s,"SylStructure")->prev(); p != 0; p=p->prev())
153 	    if (ffeature(s,"stress") == "1")
154 		return "NONE";  // specified but not on this syllable
155 	return paccent;  // first stressed syl in word
156     }
157     else if (daughter1(word)->length() == 1)
158 	return paccent;
159     else
160 	return "NONE";  // pre-specified but inappropriate syllable in word
161 }
162 
tone_specified(EST_Item * s)163 static EST_String tone_specified(EST_Item *s)
164 {
165     // If there is an explicit accent specifed on the related token
166     // If there is check the syllable to see if its strees or a singleton
167     EST_Item *ss = s->as_relation("SylStructure");
168     EST_Item *word = parent(ss);
169     if (!word) return "0";
170     EST_Item *token = parent(word,"Token");
171     EST_String ptone("0");
172     if (token)
173 	ptone = (EST_String)ffeature(token,"tone");
174 
175     if (ptone == "0")
176     {
177 	ptone = (EST_String)ffeature(word,"tone");
178 	if (ptone == "0")
179 	    return ptone;
180     }
181     if (ss->next() == 0)  // final syllable in word
182 	return ptone;
183     else
184 	return "NONE";  // pre-specified but inappropriate syllable in word
185 }
186 
FT_Int_Targets_LR_Utt(LISP utt)187 LISP FT_Int_Targets_LR_Utt(LISP utt)
188 {
189     // Predict F0 targets using Linear regression
190     EST_Utterance *u = get_c_utt(utt);
191     EST_Item *s;
192     float pstart, pmid, pend;
193     LISP start_lr, mid_lr, end_lr;
194 
195     init_int_lr_params();
196     // Note the models must *all* be the same size
197     start_lr = siod_get_lval("f0_lr_start","no f0 start lr model");
198     mid_lr = siod_get_lval("f0_lr_mid","no f0 mid lr model");
199     end_lr = siod_get_lval("f0_lr_end","no f0 end lr model");
200 
201     u->create_relation("Target");
202     pend = 0;
203     EST_FVector feats;
204     feats.resize(siod_llength(start_lr));
205 
206     for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
207     {
208 	find_feat_values(s,start_lr,feats);
209 	pstart = apply_lr_model(start_lr,feats);
210 	pstart = MAP_F0(pstart);
211 	if (after_pause(s))
212 	    add_target_at(u,daughter1(s,"SylStructure"),pstart,tp_start);
213 	else
214 	    add_target_at(u,daughter1(s,"SylStructure"),
215 			  (pstart+pend)/2.0,tp_start);
216 
217 	pmid = apply_lr_model(mid_lr,feats);
218 	pmid = MAP_F0(pmid);
219 	add_target_at(u,vowel_seg(s),pmid,tp_mid);
220 
221 	pend = apply_lr_model(end_lr,feats);
222 	pend = MAP_F0(pend);
223 	if (before_pause(s))
224 	    add_target_at(u,daughtern(s,"SylStructure"),pend,tp_end);
225     }
226 
227     return utt;
228 
229 }
230 
FT_Int_Targets_LR_5_Utt(LISP utt)231 LISP FT_Int_Targets_LR_5_Utt(LISP utt)
232 {
233   // Predict F0 targets using Linear regression
234   // This version uses 5 points rather than 3.
235    EST_Utterance *u = get_c_utt(utt);
236     EST_Item *s;
237     float pstart, pleft, pmid, pright, pend;
238     LISP start_lr, left_lr, mid_lr, right_lr, end_lr;
239 
240     init_int_lr_params();
241     // Note the models must *all* be the same size
242     start_lr = siod_get_lval("f0_lr_start","no f0 start lr model");
243     left_lr = siod_get_lval("f0_lr_left","no f0 left lr model");
244     mid_lr = siod_get_lval("f0_lr_mid","no f0 mid lr model");
245     right_lr = siod_get_lval("f0_lr_right","no f0 right lr model");
246     end_lr = siod_get_lval("f0_lr_end","no f0 end lr model");
247 
248     u->create_relation("Target");
249     pend = 0;
250     EST_FVector feats;
251     feats.resize(siod_llength(start_lr));
252 
253     for (s=u->relation("Syllable")->first(); s != 0; s=s->next())
254     {
255 	find_feat_values(s,start_lr,feats);
256 	pstart = apply_lr_model(start_lr,feats);
257 	pstart = MAP_F0(pstart);
258 	if (after_pause(s))
259 	    add_target_at(u,daughter1(s,"SylStructure"),pstart,tp_start);
260 	else
261 	    add_target_at(u,daughter1(s,"SylStructure"),
262 			  (pstart+pend)/2.0,tp_start);
263 
264 	pleft = apply_lr_model(left_lr,feats);
265 	pleft = MAP_F0(pleft);
266 	add_target_at(u,vowel_seg(s),pleft,tp_left);
267 	pmid = apply_lr_model(mid_lr,feats);
268 	pmid = MAP_F0(pmid);
269 	add_target_at(u,vowel_seg(s),pmid,tp_mid);
270 	pright = apply_lr_model(right_lr,feats);
271 	pright = MAP_F0(pright);
272 	add_target_at(u,vowel_seg(s),pright,tp_right);
273 
274 	pend = apply_lr_model(end_lr,feats);
275 	pend = MAP_F0(pend);
276 	if (before_pause(s))
277 	    add_target_at(u,daughtern(s,"SylStructure"),pend,tp_end);
278     }
279 
280     return utt;
281 
282 }
283 
284 
// Accessors for one term X of an LR model, where X is a list:
//   first element  - feature name (read as a C string)
//   second element - weight (read as a float)
//   third element  - map class, used only when the term has three elements
#define FFEATURE_NAME(X) (get_c_string(car(X)))
#define FFEATURE_WEIGHT(X) (get_c_float(car(cdr(X))))
#define FFEATURE_MAPCLASS(X) (car(cdr(cdr(X))))
288 
find_feat_values(EST_Item * s,LISP model,EST_FVector & feats)289 static void find_feat_values(EST_Item *s, LISP model,EST_FVector &feats)
290 {
291     EST_Val v = 0.0;
292     int i;
293     LISP f;
294     const char *ffeature_name, *last_name="";
295 
296     feats[0] = 1;
297     for (i=1,f=cdr(model); CONSP(f); f=CDR(f),i++)
298     {
299 	ffeature_name = FFEATURE_NAME(CAR(f));
300 	if (!streq(ffeature_name,last_name))
301 	    v = ffeature(s,ffeature_name);
302 	if (siod_llength(CAR(f)) == 3)
303 	{   // A map class is specified
304 	    if (siod_member_str(v.string(),FFEATURE_MAPCLASS(CAR(f))) != NIL)
305 		feats[i] = 1;
306 	    else
307 		feats[i] = 0;
308 	}
309 	else
310 	    feats[i] = (float)v;
311 	last_name = ffeature_name;
312     }
313 }
314 
apply_lr_model(LISP model,EST_FVector & feats)315 static float apply_lr_model(LISP model, EST_FVector &feats)
316 {
317     float answer = FFEATURE_WEIGHT(car(model));
318     int i;
319     LISP f;
320 
321     for(i=1,f=cdr(model); i<feats.n(); f=cdr(f),i++)
322 	answer += feats.a_no_check(i) * FFEATURE_WEIGHT(CAR(f));
323 
324     return answer;
325 }
326 
init_int_lr_params(void)327 static void init_int_lr_params(void)
328 {
329     LISP params;
330 
331     params = siod_get_lval("int_lr_params","no lr params");
332 
333     target_f0_mean = get_param_float("target_f0_mean",params,0.0);
334     target_f0_std = get_param_float("target_f0_std",params,1.0);
335     model_f0_mean = get_param_float("model_f0_mean",params,0.0);
336     model_f0_std = get_param_float("model_f0_std",params,1.0);
337 }
338 
339 
add_target_at(EST_Utterance * u,EST_Item * seg,float val,lr_tpos pos)340 static void add_target_at(EST_Utterance *u, EST_Item *seg,
341 			  float val,lr_tpos pos)
342 {
343     // Add a target to segment at position
344 
345     if (seg == 0)
346     {
347 	cerr << "Int_Tree: failed to find seg related to syllable for target."
348 	    << endl;
349 	return;
350     }
351 
352   if (pos == tp_start)
353     add_target(u,seg,ffeature(seg,"segment_start").Float(),val);
354   else if (pos == tp_left)
355     add_target(u,seg,
356 	       0.5*(ffeature(seg,"segment_mid").Float()+
357 		    ffeature(seg,"segment_start").Float()),
358 	       val);
359   else if (pos == tp_mid)
360     add_target(u,seg,ffeature(seg,"segment_mid").Float(),val);
361   else if (pos == tp_right)
362     add_target(u,seg,
363 	       0.5*(ffeature(seg,"segment_mid").Float()+
364 		    seg->F("end")),
365 	       val);
366   else if (pos == tp_end)
367     add_target(u,seg,seg->F("end"),val);
368   else
369     {
370 	cerr << "add_target_at: unknown position type\n";
371 	festival_error();
372     }
373 }
374 
after_pause(EST_Item * s)375 static int after_pause(EST_Item *s)
376 {
377     // TRUE if segment immediately previous to this is a silence
378     EST_Item *p;
379     if (s->prev() == 0)
380 	return TRUE;
381     EST_Item *ss = s->as_relation("SylStructure");
382     if (s->prev() == ss->prev())
383 	return FALSE;
384 
385     p = daughter1(ss)->as_relation("Segment")->prev();
386     if (p == 0)
387 	return TRUE;
388     else if (ph_is_silence(p->name()))
389 	return TRUE;
390     else
391 	return FALSE;
392 }
393 
before_pause(EST_Item * s)394 static int before_pause(EST_Item *s)
395 {
396     // TRUE is segment immediately after this is a silence
397     if (s->next() == 0)
398 	return TRUE;
399     EST_Item *ss = s->as_relation("SylStructure");
400     EST_Item *n = daughtern(ss)->as_relation("Segment")->next();
401     if (ph_is_silence(n->name()))
402 	return TRUE;
403     else
404 	return FALSE;
405 }
406 
vowel_seg(EST_Item * syl)407 static EST_Item *vowel_seg(EST_Item *syl)
408 {
409     // return related to vowel segment
410     EST_Item *p;
411 
412     for (p=daughter1(syl,"SylStructure"); p != 0; p=p->next())
413 	if (ph_is_vowel(p->name()))
414 	    return p;
415 
416     // No vowel found, so return first daughter.
417     return daughter1(syl,"SylStructure");
418 }
419 
420 
421