1 /*************************************************************************/
2 /*                                                                       */
3 /*                   Carnegie Mellon University and                      */
4 /*                Centre for Speech Technology Research                  */
5 /*                     University of Edinburgh, UK                       */
6 /*                       Copyright (c) 1998-2001                         */
7 /*                        All Rights Reserved.                           */
8 /*                                                                       */
9 /*  Permission is hereby granted, free of charge, to use and distribute  */
10 /*  this software and its documentation without restriction, including   */
11 /*  without limitation the rights to use, copy, modify, merge, publish,  */
12 /*  distribute, sublicense, and/or sell copies of this work, and to      */
13 /*  permit persons to whom this work is furnished to do so, subject to   */
14 /*  the following conditions:                                            */
15 /*   1. The code must retain the above copyright notice, this list of    */
16 /*      conditions and the following disclaimer.                         */
17 /*   2. Any modifications must be clearly marked as such.                */
18 /*   3. Original authors' names are not deleted.                         */
19 /*   4. The authors' names are not used to endorse or promote products   */
20 /*      derived from this software without specific prior written        */
21 /*      permission.                                                      */
22 /*                                                                       */
23 /*  THE UNIVERSITY OF EDINBURGH, CARNEGIE MELLON UNIVERSITY AND THE      */
24 /*  CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO     */
25 /*  THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY   */
26 /*  AND FITNESS, IN NO EVENT SHALL THE UNIVERSITY OF EDINBURGH, CARNEGIE */
27 /*  MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL,    */
28 /*  INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER          */
29 /*  RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  AN ACTION   */
30 /*  OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF     */
31 /*  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.       */
32 /*                                                                       */
33 /*************************************************************************/
34 /*             Author :  Alan W Black                                    */
35 /*             Date   :  April 1998                                      */
36 /*-----------------------------------------------------------------------*/
37 /*                                                                       */
38 /*  Yet another unit selection method.                                   */
39 /*                                                                       */
40 /*  Using an acoustic measure find the distance between all units in the */
41 /*  db.  Try to minimise the mean difference between units in a cluster  */
42 /*  using CART technology, based on features like phonetic and prosodic  */
43 /*  context.  This gives a bunch of CARTs for each unit type in the db   */
44 /*  which are acoustically close.  Use these as candidates and optimise  */
45 /*  a path through them minimising join using a viterbi search.          */
46 /*                                                                       */
47 /*  Advantages:                                                          */
48 /*    requires little or no measurements at selection time               */
49 /*    allows for clear method of pruning                                 */
50 /*    no weights need to be generated (well, except where they do)       */
51 /*    will optimise appropriately with varying numbers of example units  */
52 /*                                                                       */
53 /*  Disadvantages:                                                       */
54 /*    Units can't cross between clusters                                 */
55 /*                                                                       */
56 /*  Implementation of Black, A. and Taylor, P. (1997). Automatically     */
57 /*  clustering similar units for unit selection in speech synthesis      */
58 /*  Proceedings of Eurospeech 97, vol2 pp 601-604, Rhodes, Greece.       */
59 /*                                                                       */
60 /*  postscript: http://www.cs.cmu.edu/~awb/papers/ES97units.ps           */
61 /*  http://www.cs.cmu.edu/~awb/papers/ES97units/ES97units.html           */
62 /*                                                                       */
63 /*  Comments:                                                            */
64 /*                                                                       */
65 /*  This is a new implementation using the newer unit selection/signal   */
/*  processing architecture in festival                                  */
67 /*                                                                       */
/*  This is still in development but becoming more stable.  It is robust */
69 /*  for many cases, though a lot depends on the db and parameters        */
70 /*  you use                                                              */
71 /*                                                                       */
72 /*  This had significant new work (and bug fixes) done on it when awb    */
73 /*  moved to CMU                                                         */
74 /*                                                                       */
75 /*=======================================================================*/
76 #include <cstdlib>
77 #include "EST_math.h"
78 #include "festival.h"
79 #include "clunits.h"
80 
// Names of the per-step features that TS_npath() writes onto each Viterbi
// path node and that clunits_select() copies back after the search.
static EST_String static_unit_prev_move = "unit_prev_move";
static EST_String static_unit_this_move = "unit_this_move";
static EST_String static_jscore = "local_join_cost";
static EST_String static_tscore = "local_target_cost";
static EST_String static_cscore = "cummulative_unit_score";

// Forward declarations of the file-local helpers defined further down.
static void setup_clunits_params();
static EST_VTCandidate *TS_candlist(EST_Item *s,EST_Features &f);
static EST_VTPath *TS_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f);
static float naive_join_cost(CLunit *unit0, CLunit *unit1,
			     EST_Item *s,
			     float &u0_move,
			     float &u1_move);
static float optimal_couple(CLunit *u0,
			    CLunit *u1,
			    float &u0_move,
			    float &u1_move,
			    int type,
			    float different_prev_pen,
			    float non_consecutive_pen);
static void cl_parse_diphone_times(EST_Relation &diphone_stream,
				   EST_Relation &source_lab);

// Provide the vtcand()/clunit() accessors and est_val() wrappers used in
// this file to pass EST_VTCandidate and CLunit pointers around as EST_Val.
// NOTE(review): the NODEL suffix presumably means the wrapped object is not
// deleted with the value -- confirm against the macro definition.
VAL_REGISTER_CLASS_NODEL(vtcand,EST_VTCandidate);
VAL_REGISTER_CLASS_NODEL(clunit,CLunit);

// Module state.  Everything below except cldb is (re)assigned by
// setup_clunits_params() from the Lisp variables "clunits_params" and
// "clunits_selection_trees"; the initialisers here match the defaults
// used there.
LISP selection_trees = NIL;       // assoc list: unit type name -> tree
LISP clunits_params = NIL;        // raw parameter list
static int optimal_coupling = 0;  // 0: naive join cost, 1 or 2: optimal_couple()
static int extend_selections = 0; // max extra candidates added in TS_candlist()
static int clunits_debug = 0;     // non-zero enables printf tracing
static int clunits_log_scores = 0;    // non-zero: TS_npath() takes log of join cost
static int clunits_smooth_frames = 0; // used by clunits_windowed_wave()
float continuity_weight = 1;      // multiplier applied to the join cost
float f0_join_weight = 0.0;       // passed to frame_distance()
float different_prev_pen = 1000.0;
float non_consecutive_pen = 100.0;
static EST_String clunit_name_feat = "name"; // feature giving each segment's unit type

// Database in use; refreshed from check_cldb() by the entry points
// in this file before it is used.
static CLDB *cldb;
121 
clunits_select(LISP utt)122 static LISP clunits_select(LISP utt)
123 {
124     // Select units from db using CARTs to index into clustered unit groups
125     EST_Utterance *u = get_c_utt(utt);
126     EST_Item *s, *f;
127 
128     cldb = check_cldb();  // make sure there is one loaded
129     setup_clunits_params();
130 
131     f = u->relation("Segment")->head();
132     for (s=f; s; s=s->next())
133 	s->set_val("clunit_name",ffeature(s,clunit_name_feat));
134 
135     if (f)
136     {
137 	EST_Viterbi_Decoder v(TS_candlist,TS_npath,-1);
138 	v.set_big_is_good(FALSE);  // big is bad
139 
140 	v.initialise(u->relation("Segment"));
141 	v.search();
142 	if (!v.result("unit_id"))
143 	{
144 	    cerr << "CLUNIT: failed to find path\n";
145 	    return utt;
146 	}
147 	v.copy_feature(static_unit_this_move);
148 	v.copy_feature(static_unit_prev_move);
149 	v.copy_feature(static_jscore);
150 	v.copy_feature(static_tscore);
151 	v.copy_feature(static_cscore);
152     }
153 
154     return utt;
155 }
156 
clunits_get_units(LISP utt)157 static LISP clunits_get_units(LISP utt)
158 {
159     // Create unit stream and loading params
160     EST_Utterance *u = get_c_utt(utt);
161     EST_Relation *units,*ss;
162     EST_Item *s;
163 
164     cldb = check_cldb();  // make sure there is one loaded
165 
166     units = u->create_relation("Unit");
167     for (s=u->relation("Segment")->head(); s != 0; s=s->next())
168     {
169 	EST_Item *unit = units->append();
170 	CLunit *db_unit = clunit(s->f("unit_id"));
171 	float st,e;
172 	unit->set_name(db_unit->name);
173 	unit->set("fileid",db_unit->fileid);
174 	// These should be modified from the optimal coupling
175 	if ((s->prev()) && (s->f_present("unit_this_move")))
176 	    st = s->F("unit_this_move");
177 	else
178 	    st = db_unit->start;
179 	if (s->next() && (s->next()->f_present("unit_prev_move")))
180 	    e = s->next()->F("unit_prev_move");
181 	else
182 	    e = db_unit->end;
183 	if ((e-st) < 0.011)
184 	    e = st + 0.011;
185 	unit->set("start",st);
186 	unit->set("middle",db_unit->start);
187 	unit->set("end",e);
188 	unit->set("unit_start",st);
189 	unit->set("unit_middle",db_unit->start);
190 	unit->set("unit_end",e);
191 	unit->set("seg_start",db_unit->start);
192 	unit->set("seg_end",db_unit->end);
193 	cldb->load_coefs_sig(unit);
194 	if (clunits_debug)
195 	    printf("unit: %s fileid %s start %f end %f\n",
196 		   (const char *)db_unit->name,
197 		   (const char *)db_unit->fileid,
198 		   st,e);
199     }
200 
201     // Make it look as much like the diphones as possible for
202     // the rest of the code
203     ss = u->create_relation("SourceSegments");
204     for (s = u->relation("Segment")->head(); s != 0 ; s = s->next())
205     {
206 	EST_Item *d = ss->append();
207 	d->set_name(ffeature(s,"clunit_name"));
208     }
209 
210     cl_parse_diphone_times(*units,*ss);
211 
212     return utt;
213 }
214 
cl_parse_diphone_times(EST_Relation & diphone_stream,EST_Relation & source_lab)215 static void cl_parse_diphone_times(EST_Relation &diphone_stream,
216 				   EST_Relation &source_lab)
217 {
218     EST_Item *s, *u;
219     EST_Track *pm;
220     int e_frame, m_frame = 0;
221     float dur_1 = 0.0, dur_2 = 0.0, p_time;
222     float t_time = 0.0, end;
223     p_time = 0.0;
224 
225     for (s = source_lab.head(), u = diphone_stream.head(); u; u = u->next(),
226 	 s = s->next())
227     {
228 	pm = track(u->f("coefs"));
229 	if (pm == 0)
230 	{
231 	    cerr << "CLUNIT: couldn't get pitchmarks for " << u->name() << endl;
232 	    festival_error();
233 	}
234 
235 	e_frame = pm->num_frames() - 1;
236 	m_frame = u->I("middle_frame");
237 
238 	dur_1 = pm->t(m_frame);
239 	dur_2 = pm->t(e_frame) - dur_1;
240 
241 	s->set("end", (dur_1 + p_time));
242 	p_time = s->F("end") + dur_2;
243 
244 	end = dur_1 + dur_2 + t_time;
245 	t_time = end;
246 	u->set("end", t_time);
247     }
248     if (s)
249 	s->set("end", (dur_2 + p_time));
250 }
251 
clunits_simple_wave(LISP utt)252 static LISP clunits_simple_wave(LISP utt)
253 {
254     // Naive joining of waveforms
255     EST_Utterance *u = get_c_utt(utt);
256     EST_Wave *w = new EST_Wave;
257     EST_Wave *w1 = 0;
258     EST_Item *witem = 0;
259     EST_Item *s;
260     int size,i,k,c;
261 
262     for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
263 	size += wave(s->f("sig"))->num_samples();
264 
265     if (u->relation("Unit")->head())
266     {   // This will copy the necessary wave features across
267 	s = u->relation("Unit")->head();
268 	*w = *(wave(s->f("sig")));
269     }
270     i = w->num_samples();
271     w->resize(size); // its maximum size
272     for (s=u->relation("Unit")->head()->next(); s; s=s->next())
273     {
274 	w1 = wave(s->f("sig"));
275 	// Find last zero crossing
276 	for (c=0; ((i > 0) && (c < 40)); c++,i--)
277 	    if (((w->a_no_check(i) < 0) && (w->a_no_check(i-1) >= 0)) ||
278 		((w->a_no_check(i) >= 0) && (w->a_no_check(i-1) < 0)))
279 		break;
280 	if (c == 40) i += 40;
281 	// Find next zero crossing
282 	for (c=0,k=1; ((k < w1->num_samples()) && (c < 40)); k++,i++)
283 	    if (((w1->a_no_check(k) < 0) && (w1->a_no_check(k-1) >= 0)) ||
284 		((w1->a_no_check(k) >= 0) && (w1->a_no_check(k-1) < 0)))
285 		break;
286 	if (c == 40) k -= 40;
287 	for (; k < w1->num_samples(); k++,i++)
288 	    w->a_no_check(i) = w1->a_no_check(k);
289     }
290     w->resize(i);
291 
292     witem = u->create_relation("Wave")->append();
293     witem->set_val("wave",est_val(w));
294 
295     return utt;
296 }
297 
clunits_windowed_wave(LISP utt)298 static LISP clunits_windowed_wave(LISP utt)
299 {
300     // windowed join, no prosodic modification
301     EST_Utterance *u = get_c_utt(utt);
302     EST_Wave *w = new EST_Wave;
303     EST_Wave *w1 = 0;
304     EST_Track *t1 = 0;
305     EST_Item *witem = 0;
306     EST_Item *s;
307     int size,i,k,wi,samp_idx, l_samp_idx;
308     int width, lwidth;
309     EST_Wave *www=0;
310 
311     for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
312 	size += wave(s->f("sig"))->num_samples();
313 
314     if (u->relation("Unit")->head())
315     {   // This will copy the necessary wave features across
316 	s = u->relation("Unit")->head();
317 	www = wave(s->f("sig"));
318 	*w = *www;
319     }
320     w->resize(size); // its maximum size
321     wi=0;
322     lwidth = width = 0;
323     for (s=u->relation("Unit")->head(); s; s=s->next())
324     {
325 	w1 = wave(s->f("sig"));
326 	t1 = track(s->f("coefs"));
327 
328 	l_samp_idx = 0;
329 	for (i=0; i < t1->num_frames()-1; i++)
330 	{
331 	    samp_idx = (int)(t1->t(i)*w->sample_rate());
332 	    width = samp_idx - l_samp_idx;
333 	    if (clunits_smooth_frames && (i==0) && (lwidth != 0))
334 		width = (width+lwidth)/2;  // not sure if this is worth it
335 	    wi += width;
336 	    for (k=-width; ((k<width)&&((samp_idx+k)<w1->num_samples())) ;k++)
337 		w->a(wi+k) +=
338 		    (int)(0.5*(1+cos((PI/(double)(width))*(double)k))*
339 			w1->a(samp_idx+k));
340 	    l_samp_idx = samp_idx;
341 	}
342 	lwidth = width;
343     }
344     w->resize(wi);
345 
346     witem = u->create_relation("Wave")->append();
347     witem->set_val("wave",est_val(w));
348 
349     return utt;
350 }
351 
clunits_smoothedjoin_wave(LISP utt)352 static LISP clunits_smoothedjoin_wave(LISP utt)
353 {
354     // Actually not very smoothed yet, just joined
355     EST_Utterance *u = get_c_utt(utt);
356     EST_Wave *w = new EST_Wave;
357     EST_Wave *w1 = 0;
358     EST_Track *t1 = 0;
359     EST_Item *witem = 0;
360     EST_Item *s;
361     int size,i,wi;
362     int samp_end, samp_start;
363     EST_Wave *www=0;
364 
365     for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
366     {
367 	samp_end = s->I("samp_end");
368 	samp_start = s->I("samp_start");
369 	size += samp_end-samp_start;
370     }
371 
372     if (u->relation("Unit")->head())
373     {   // This will copy the necessary wave features across
374 	s = u->relation("Unit")->head();
375 	www = wave(s->f("sig"));
376 	*w = *www;
377     }
378     w->resize(size); // its maximum size
379     wi=0;
380     for (s=u->relation("Unit")->head(); s; s=s->next())
381     {
382 	samp_end = s->I("samp_end");
383 	samp_start = s->I("samp_start");
384 	w1 = wave(s->f("sig"));
385 /*	printf("%s %s %f %f %d %d\n",
386 	       (const char *)s->S("name"),
387 	       (const char *)s->S("fileid"),
388 	       (float)samp_start/(float)w->sample_rate(),
389 	       (float)samp_end/(float)w->sample_rate(),
390 	       w1->num_samples(),
391 	       samp_end); */
392 	t1 = track(s->f("coefs"));
393 	for (i=samp_start; i<samp_end; i++,wi++)
394 	    w->a_no_check(wi) = w1->a_no_check(i);
395 /*	printf("%d %f\n",wi,(float)wi/(float)w->sample_rate()); */
396     }
397     w->resize(wi);
398 
399     witem = u->create_relation("Wave")->append();
400     witem->set_val("wave",est_val(w));
401 
402     return utt;
403 }
404 
setup_clunits_params()405 static void setup_clunits_params()
406 {
407     // Set up params
408     clunits_params = siod_get_lval("clunits_params",
409 				    "CLUNITS: no parameters set for module");
410     optimal_coupling = get_param_int("optimal_coupling",clunits_params,0);
411     different_prev_pen = get_param_float("different_prev_pen",clunits_params,1000.0);
412     non_consecutive_pen = get_param_float("non_consectutive_pen",clunits_params,100.0);
413     extend_selections = get_param_int("extend_selections",clunits_params,0);
414     continuity_weight = get_param_float("continuity_weight",clunits_params,1);
415     f0_join_weight = get_param_float("f0_join_weight",clunits_params,0.0);
416     clunits_debug = get_param_int("clunits_debug",clunits_params,0);
417     clunits_log_scores = get_param_int("log_scores",clunits_params,0);
418     clunits_smooth_frames = get_param_int("smooth_frames",clunits_params,0);
419     clunit_name_feat = get_param_str("clunit_name_feat",clunits_params,"name");
420     selection_trees =
421 	siod_get_lval("clunits_selection_trees",
422 		      "CLUNITS: clunits_selection_trees unbound");
423 }
424 
TS_candlist(EST_Item * s,EST_Features & f)425 static EST_VTCandidate *TS_candlist(EST_Item *s,EST_Features &f)
426 {
427     // Return a list of candidate units for target s
428     // Use the appropriate CART to select a small group of candidates
429     EST_VTCandidate *all_cands = 0;
430     EST_VTCandidate *c, *gt;
431     LISP tree,group,l,pd,cc,ls;
432     EST_String name;
433     EST_String lookingfor;
434     CLunit *u;
435     int bbb,ccc;
436     float cluster_mean;
437     (void)f;
438     bbb=ccc=0;
439 
440     lookingfor = s->S("clunit_name");
441     ls = siod(s);
442 
443     cc = siod_get_lval("clunits_cand_hooks",NULL);
444     if (cc)
445 	pd = apply_hooks(siod_get_lval("clunits_cand_hooks",NULL),
446 			 ls);
447     else
448     {
449 	tree = car(cdr(siod_assoc_str(lookingfor,selection_trees)));
450 	pd = wagon_pd(s,tree);
451     }
452     if (pd == NIL)
453     {
454 	cerr << "CLUNITS: no predicted class for " <<
455 	    s->S("clunit_name") << endl;
456 	festival_error();
457     }
458     group = car(pd);
459     cluster_mean = get_c_float(car(cdr(pd)));
460 
461     for (bbb=0,l=group; l != NIL; l=cdr(l),bbb++)
462     {
463 	c = new EST_VTCandidate;
464 	name = s->S("clunit_name")+"_"+get_c_string(car(car(l)));
465 	u = cldb->get_unit(name);
466 	if (u == 0)
467 	{
468 	    cerr << "CLUNITS: failed to find unit " << name <<
469 		" in index" << endl;
470 	    festival_error();
471 	}
472 	cldb->load_join_coefs(u);
473 	c->name = est_val(u);
474 	c->s = s;
475 	// Mean distance from others in cluster (could be precalculated)
476 	c->score = get_c_float(car(cdr(car(l))))-cluster_mean;
477 	c->score *= c->score;
478 	// Maybe this should be divided by overall mean of set
479 	// to normalise this figure (?)
480 
481 	c->next = all_cands;
482 	all_cands = c;
483     }
484 
485     if (extend_selections)
486     {
487 	// An experiment, for all candidates of the previous
488 	// item whose following is of this phone type, include
489 	// them as a candidate
490 	EST_Item *ppp = s->prev();
491 	if (ppp)
492 	{
493 	    EST_VTCandidate *lc = vtcand(ppp->f("unit_cands"));
494 	    for (ccc=0 ; lc && (ccc < extend_selections); lc = lc->next)
495 	    {
496 		CLunit *unit = clunit(lc->name);
497 		CLunit *next_unit;
498 
499 		if (unit->next_unit)
500 		    next_unit = unit->next_unit;
501 		else
502 		    continue;
503 		EST_String ss;
504 		ss = next_unit->name.before("_");
505 		if (ss.matches(".*_.*_.*"))
506 		{
507 		    ss += "_";
508 		    ss += next_unit->name.after("_").before("_");
509 		}
510 /*		printf("%s %s\n",(const char *)ss, (const char *)lookingfor); */
511 		for (gt=all_cands; gt; gt=gt->next)
512 		    if (clunit(gt->name)->name == next_unit->name)
513 			break;  /* got this one already */
514 		if ((ss == lookingfor) && (gt == 0))
515 		{  // its the right type so add it
516 		    c = new EST_VTCandidate;
517 		    c->name = est_val(next_unit);
518 		    cldb->load_join_coefs(next_unit);
519 		    c->s = s;
520 		    c->score = 0;
521 		    c->next = all_cands;
522 		    all_cands = c;
523 		    bbb++;
524 		    ccc++;
525 		}
526 	    }
527 	}
528 
529 	s->set_val("unit_cands",est_val(all_cands));
530     }
531     if (clunits_debug)
532 	printf("cands %d (extends %d) %s\n",bbb,ccc,(const char *)lookingfor);
533     return all_cands;
534 }
535 
TS_npath(EST_VTPath * p,EST_VTCandidate * c,EST_Features & f)536 static EST_VTPath *TS_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f)
537 {
538     // Combine candidate c with previous path updating score
539     // with join cost
540     float cost;
541     EST_VTPath *np = new EST_VTPath;
542     CLunit *u0, *u1;
543     float u0_move=0.0, u1_move=0.0;
544     (void)f;
545 
546     np->c = c;
547     np->from = p;
548     if ((p == 0) || (p->c == 0))
549 	cost = 0;  // nothing previous to join to
550     else
551     {
552 	u0 = clunit(p->c->name);
553 	u1 = clunit(c->name);
554 //	printf("u0 %s u1 %s\n",
555 //	       (const char *)u0->name,
556 //	       (const char *)u1->name);
557 	if (optimal_coupling)
558 	    cost = optimal_couple(u0,u1,u0_move,u1_move,
559 				  optimal_coupling,
560 				  different_prev_pen,
561 				  non_consecutive_pen);
562 	else // naive measure
563 	    cost = naive_join_cost(u0,u1,c->s,u0_move,u1_move);
564 	// When optimal_coupling == 2 the moves will be 0, just the scores
565 	// are relevant
566 	if (optimal_coupling == 1)
567 	{
568 	    np->f.set(static_unit_prev_move,u0_move); // new (prev) end
569 	    np->f.set(static_unit_this_move,u1_move); // new start
570 	}
571     }
572 //    printf("cost %f continuity_weight %f\n", cost, continuity_weight);
573     cost *= continuity_weight;
574     np->state = c->pos;  // "state" is candidate number
575     if (clunits_log_scores && (cost != 0))
576 	cost = log(cost);
577 
578     np->f.set(static_jscore,cost);
579     np->f.set(static_tscore,c->score);
580     if (p==0)
581 	np->score = (c->score+cost);
582     else
583 	np->score = (c->score+cost) + p->score;
584     np->f.set(static_cscore,np->score);
585 
586     if (clunits_debug > 1)
587 	printf("joining cost %f\n",np->score);
588     return np;
589 }
590 
optimal_couple(CLunit * u0,CLunit * u1,float & u0_move,float & u1_move,int type,float different_prev_pen,float non_consecutive_pen)591 static float optimal_couple(CLunit *u0,
592 			    CLunit *u1,
593 			    float &u0_move,
594 			    float &u1_move,
595 			    int type,
596 			    float different_prev_pen,
597 			    float non_consecutive_pen
598 			    )
599 {
600     // Find combination cost of u0 to u1, checking for best
601     // frame up to n frames back in u0 and u1.
602     // Note this checks the u0 with u1's predecessor, which may or may not
603     // be of the same type
604     // There is some optimisation here in unit coeff access
605     EST_Track *u0_cep, *u1_p_cep;
606     float dist, best_val;
607     int i,eee;
608     int u0_st, u0_end;
609     int u1_p_st, u1_p_end;
610     int best_u0, best_u1;
611     CLunit *u1_p;
612     float f;
613 
614     u1_p = u1->prev_unit;
615 
616     u0_move = u0->end;
617     if (u1_p == 0)
618 	u1_move = 0;
619     else
620 	u1_move = u1_p->end;
621 
622     if (u1_p == u0)  // they are consecutive
623 	return 0.0;
624     if (u1_p == 0)   // hacky condition, when there is no previous we'll
625 	return 0.0;  // assume a good join (should be silence there)
626 
627     if (u1_p->join_coeffs == 0)
628 	cldb->load_join_coefs(u1_p);
629     // Get indexes into full cep for utterances rather than sub ceps
630     u0_cep = u0->join_coeffs;
631     u1_p_cep = u1_p->join_coeffs;
632 
633     u0_end = u0_cep->num_frames();
634     u1_p_end = u1_p_cep->num_frames();
635 
636     if (!streq(u1_p->base_name,u0->base_name))
637     {   /* prev(u1) is a different phone from u0 so don't slide */
638 	f = different_prev_pen;
639 	u0_st = u0_cep->num_frames()-1;
640 	u1_p_st = u1_p_cep->num_frames()-1;
641     }
642     else if (type == 2)
643     {   /* we'll only check the edge for the join */
644 	u0_st = u0_cep->num_frames()-1;
645 	u1_p_st = u1_p_cep->num_frames()-1;
646 	f = 1;
647     }
648     else
649     {
650 	u0_st = (int)(u0_cep->num_frames() * 0.33);
651 	u1_p_st = (int)(u1_p_cep->num_frames() * 0.33);
652 	f = 1;
653     }
654 
655     best_u0=u0_end;
656     best_u1=u1_p_end;
657     best_val = HUGE_VAL;
658 
659     // Here we look for the best join without sliding the windows
660     if ((u0_end-u0_st) < (u1_p_end-u1_p_st))
661 	eee = u0_end-u0_st;
662     else
663 	eee = u1_p_end-u1_p_st;
664     for (i=0; i < eee; i++)
665     {
666 	dist = frame_distance(*u0_cep,i+u0_st,
667 			      *u1_p_cep,i+u1_p_st,
668 			      cldb->cweights,
669 			      f0_join_weight);
670 	if (dist < best_val)
671 	{
672 	    best_val = dist;
673 	    best_u0 = i+u0_st;
674 	    best_u1 = i+u1_p_st;
675 	}
676     }
677 #if 0
678     // This tries *all* possible matches in the pair, its slow
679     // and has a tendency to shorten things more than you'd like
680     // so we just use the more simple test above.
681     int j;
682     for (i=u0_st; i < u0_end; i++)
683     {
684 	for (j=u1_p_st; j < u1_p_end; j++)
685 	{
686 	    dist = frame_distance(*u0_cep,i,
687 				  *u1_p_cep,j,
688 				  cldb->cweights);
689 	    if (dist < best_val)
690 	    {
691 		best_val = dist;
692 		best_u0 = i;
693 		best_u1 = j;
694 	    }
695 	}
696     }
697 #endif
698 
699     if (type == 1)
700     {
701 	u0_move = u0_cep->t(best_u0);
702 	u1_move = u1_p_cep->t(best_u1);
703     }
704 
705     return non_consecutive_pen+(best_val*f);
706 }
707 
naive_join_cost(CLunit * unit0,CLunit * unit1,EST_Item * s,float & u0_move,float & u1_move)708 static float naive_join_cost(CLunit *unit0, CLunit *unit1,
709 			     EST_Item *s,
710 			     float &u0_move,
711 			     float &u1_move)
712 {
713     // A naive join cost, because I haven't ported the info yet
714 
715     u0_move = unit0->end;
716     u1_move = unit1->start;
717 
718     if (unit0 == unit1)
719 	return 0;
720     else if (unit1->prev_unit->name == unit0->name)
721 	return 0;
722     else if (ph_is_silence(s->name()))
723 	return 0;
724     else if (ph_is_stop(s->name()))
725 	return 0.2;
726     else if (ph_is_fricative(s->name()))
727 	return 0.3;
728     else
729 	return 1.0;
730 }
731 
cldb_load_all_coeffs(LISP filelist)732 static LISP cldb_load_all_coeffs(LISP filelist)
733 {
734     LISP f;
735 
736     cldb = check_cldb();
737     for (f=filelist; f; f=cdr(f))
738     {
739 	cldb->get_file_coefs_sig(get_c_string(car(f)));
740 	cldb->get_file_join_coefs(get_c_string(car(f)));
741     }
742 
743     return NIL;
744 }
745 
void festival_clunits_init(void)
{
    // Initialization for clunits selection: announces the module and
    // registers its utterance modules and Lisp-callable functions, each
    // with the documentation string given as the last argument.

    proclaim_module("clunits",
            "Copyright (C) University of Edinburgh and CMU 1997-2010\n");

    // These globals hold Lisp values fetched in setup_clunits_params().
    // NOTE(review): gc_protect presumably stops the Lisp garbage
    // collector reclaiming what they point at -- confirm.
    gc_protect(&clunits_params);
    gc_protect(&selection_trees);

    // Utterance modules implemented in this file
    festival_def_utt_module("Clunits_Select",clunits_select,
    "(Clunits_Select UTT)\n\
  Select units from current databases using cluster selection method.");

    festival_def_utt_module("Clunits_Get_Units",clunits_get_units,
    "(Clunits_Get_Units UTT)\n\
  Construct Unit relation from the selected units in Segment and extract\n\
  their parameters from the clunit db.");

    festival_def_utt_module("Clunits_Simple_Wave",clunits_simple_wave,
    "(Clunits_Simple_Wave UTT)\n\
  Naively concatenate signals together into a single wave (for debugging).");

    festival_def_utt_module("Clunits_Windowed_Wave",clunits_windowed_wave,
    "(Clunits_Windowed_Wave UTT)\n\
  Use hamming window over edges of units to join them, no prosodic \n\
  modification though.");

    festival_def_utt_module("Clunits_SmoothedJoin_Wave",clunits_smoothedjoin_wave,
    "(Clunits_SmoothedJoin_Wave UTT)\n\
  smoothed join.");

    // Database management functions.  cl_load_db, cldb_select and
    // cldb_list are not defined in this file.
    init_subr_1("clunits:load_db",cl_load_db,
    "(clunits:load_db PARAMS)\n\
  Load index file for cluster database and set up params, and select it.");

    init_subr_1("clunits:select",cldb_select,
    "(clunits:select NAME)\n\
  Select a previously loaded cluster database.");

    init_subr_1("clunits:load_all_coefs",cldb_load_all_coeffs,
    "(clunits:load_all_coefs FILEIDLIST)\n\
  Load in coefficients, signal and join coefficients for each named\n\
  fileid.  This is can be called at startup to to reduce the load time\n\
  during synthesis (though may make the image large).");

    init_subr_0("clunits:list",cldb_list,
    "(clunits:list)\n\
  List names of currently loaded cluster databases.");

    // Acoustic cost and mapping functions; their implementations are
    // not defined in this file.
    init_subr_2("acost:build_disttabs",make_unit_distance_tables,
    "(acost:build_disttabs UTTTYPES PARAMS)\n\
  Built matrices of distances between each ling_item in each each list\n\
  of ling_items in uttypes.   Uses acoustic weights in PARAMS and save\n\
  the result as a matrix for later use.");

    init_subr_2("acost:utt.load_coeffs",acost_utt_load_coeffs,
    "(acost:utt.load_coeffs UTT PARAMS)\n\
  Load in the acoustic coefficients into UTT and set the Acoustic_Coeffs\n\
  feature for each segment in UTT.");

    init_subr_3("acost:file_difference",ac_distance_tracks,
    "(acost:file_difference FILENAME1 FILENAME2 PARAMS)\n\
  Load in the two named tracks and find the acoustic difference over all\n\
  based on the weights in PARAMS.");

    init_subr_2("cl_mapping", l_cl_mapping,
      "(cl_mapping UTT PARAMS)\n\
  Impose prosody upto some percentage, and not absolutely.");

}
817