1 /*************************************************************************/
2 /* */
3 /* Carnegie Mellon University and */
4 /* Centre for Speech Technology Research */
5 /* University of Edinburgh, UK */
6 /* Copyright (c) 1998-2001 */
7 /* All Rights Reserved. */
8 /* */
9 /* Permission is hereby granted, free of charge, to use and distribute */
10 /* this software and its documentation without restriction, including */
11 /* without limitation the rights to use, copy, modify, merge, publish, */
12 /* distribute, sublicense, and/or sell copies of this work, and to */
13 /* permit persons to whom this work is furnished to do so, subject to */
14 /* the following conditions: */
15 /* 1. The code must retain the above copyright notice, this list of */
16 /* conditions and the following disclaimer. */
17 /* 2. Any modifications must be clearly marked as such. */
18 /* 3. Original authors' names are not deleted. */
19 /* 4. The authors' names are not used to endorse or promote products */
20 /* derived from this software without specific prior written */
21 /* permission. */
22 /* */
23 /* THE UNIVERSITY OF EDINBURGH, CARNEGIE MELLON UNIVERSITY AND THE */
24 /* CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO */
25 /* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY */
26 /* AND FITNESS, IN NO EVENT SHALL THE UNIVERSITY OF EDINBURGH, CARNEGIE */
27 /* MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, */
28 /* INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER */
29 /* RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION */
30 /* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF */
31 /* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
32 /* */
33 /*************************************************************************/
34 /* Author : Alan W Black */
35 /* Date : April 1998 */
36 /*-----------------------------------------------------------------------*/
37 /* */
38 /* Yet another unit selection method. */
39 /* */
40 /* Using an acoustic measure find the distance between all units in the */
41 /* db. Try to minimise the mean difference between units in a cluster */
42 /* using CART technology, based on features like phonetic and prosodic */
43 /* context. This gives a bunch of CARTs for each unit type in the db */
44 /* which are acoustically close. Use these as candidates and optimise */
45 /* a path through them minimising join using a viterbi search. */
46 /* */
47 /* Advantages: */
48 /* requires little or no measurements at selection time */
49 /* allows for clear method of pruning */
50 /* no weights need to be generated (well, except where they do) */
51 /* will optimise appropriately with varying numbers of example units */
52 /* */
53 /* Disadvantages: */
54 /* Units can't cross between clusters */
55 /* */
56 /* Implementation of Black, A. and Taylor, P. (1997). Automatically */
57 /* clustering similar units for unit selection in speech synthesis */
58 /* Proceedings of Eurospeech 97, vol2 pp 601-604, Rhodes, Greece. */
59 /* */
60 /* postscript: http://www.cs.cmu.edu/~awb/papers/ES97units.ps */
61 /* http://www.cs.cmu.edu/~awb/papers/ES97units/ES97units.html */
62 /* */
63 /* Comments: */
64 /* */
65 /* This is a new implementation using the newer unit selection/signal */
66 /* processing architecture in festival */
67 /* */
68 /* This is still in development but is becoming more stable. It is robust */
69 /* for many cases, though a lot depends on the db and parameters */
70 /* you use */
71 /* */
72 /* This had significant new work (and bug fixes) done on it when awb */
73 /* moved to CMU */
74 /* */
75 /*=======================================================================*/
76 #include <cstdlib>
77 #include "EST_math.h"
78 #include "festival.h"
79 #include "clunits.h"
80
81 static EST_String static_unit_prev_move = "unit_prev_move";
82 static EST_String static_unit_this_move = "unit_this_move";
83 static EST_String static_jscore = "local_join_cost";
84 static EST_String static_tscore = "local_target_cost";
85 static EST_String static_cscore = "cummulative_unit_score";
86
87 static void setup_clunits_params();
88 static EST_VTCandidate *TS_candlist(EST_Item *s,EST_Features &f);
89 static EST_VTPath *TS_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f);
90 static float naive_join_cost(CLunit *unit0, CLunit *unit1,
91 EST_Item *s,
92 float &u0_move,
93 float &u1_move);
94 static float optimal_couple(CLunit *u0,
95 CLunit *u1,
96 float &u0_move,
97 float &u1_move,
98 int type,
99 float different_prev_pen,
100 float non_consecutive_pen);
101 static void cl_parse_diphone_times(EST_Relation &diphone_stream,
102 EST_Relation &source_lab);
103
104 VAL_REGISTER_CLASS_NODEL(vtcand,EST_VTCandidate);
105 VAL_REGISTER_CLASS_NODEL(clunit,CLunit);
106
107 LISP selection_trees = NIL;
108 LISP clunits_params = NIL;
109 static int optimal_coupling = 0;
110 static int extend_selections = 0;
111 static int clunits_debug = 0;
112 static int clunits_log_scores = 0;
113 static int clunits_smooth_frames = 0;
114 float continuity_weight = 1;
115 float f0_join_weight = 0.0;
116 float different_prev_pen = 1000.0;
117 float non_consecutive_pen = 100.0;
118 static EST_String clunit_name_feat = "name";
119
120 static CLDB *cldb;
121
clunits_select(LISP utt)122 static LISP clunits_select(LISP utt)
123 {
124 // Select units from db using CARTs to index into clustered unit groups
125 EST_Utterance *u = get_c_utt(utt);
126 EST_Item *s, *f;
127
128 cldb = check_cldb(); // make sure there is one loaded
129 setup_clunits_params();
130
131 f = u->relation("Segment")->head();
132 for (s=f; s; s=s->next())
133 s->set_val("clunit_name",ffeature(s,clunit_name_feat));
134
135 if (f)
136 {
137 EST_Viterbi_Decoder v(TS_candlist,TS_npath,-1);
138 v.set_big_is_good(FALSE); // big is bad
139
140 v.initialise(u->relation("Segment"));
141 v.search();
142 if (!v.result("unit_id"))
143 {
144 cerr << "CLUNIT: failed to find path\n";
145 return utt;
146 }
147 v.copy_feature(static_unit_this_move);
148 v.copy_feature(static_unit_prev_move);
149 v.copy_feature(static_jscore);
150 v.copy_feature(static_tscore);
151 v.copy_feature(static_cscore);
152 }
153
154 return utt;
155 }
156
clunits_get_units(LISP utt)157 static LISP clunits_get_units(LISP utt)
158 {
159 // Create unit stream and loading params
160 EST_Utterance *u = get_c_utt(utt);
161 EST_Relation *units,*ss;
162 EST_Item *s;
163
164 cldb = check_cldb(); // make sure there is one loaded
165
166 units = u->create_relation("Unit");
167 for (s=u->relation("Segment")->head(); s != 0; s=s->next())
168 {
169 EST_Item *unit = units->append();
170 CLunit *db_unit = clunit(s->f("unit_id"));
171 float st,e;
172 unit->set_name(db_unit->name);
173 unit->set("fileid",db_unit->fileid);
174 // These should be modified from the optimal coupling
175 if ((s->prev()) && (s->f_present("unit_this_move")))
176 st = s->F("unit_this_move");
177 else
178 st = db_unit->start;
179 if (s->next() && (s->next()->f_present("unit_prev_move")))
180 e = s->next()->F("unit_prev_move");
181 else
182 e = db_unit->end;
183 if ((e-st) < 0.011)
184 e = st + 0.011;
185 unit->set("start",st);
186 unit->set("middle",db_unit->start);
187 unit->set("end",e);
188 unit->set("unit_start",st);
189 unit->set("unit_middle",db_unit->start);
190 unit->set("unit_end",e);
191 unit->set("seg_start",db_unit->start);
192 unit->set("seg_end",db_unit->end);
193 cldb->load_coefs_sig(unit);
194 if (clunits_debug)
195 printf("unit: %s fileid %s start %f end %f\n",
196 (const char *)db_unit->name,
197 (const char *)db_unit->fileid,
198 st,e);
199 }
200
201 // Make it look as much like the diphones as possible for
202 // the rest of the code
203 ss = u->create_relation("SourceSegments");
204 for (s = u->relation("Segment")->head(); s != 0 ; s = s->next())
205 {
206 EST_Item *d = ss->append();
207 d->set_name(ffeature(s,"clunit_name"));
208 }
209
210 cl_parse_diphone_times(*units,*ss);
211
212 return utt;
213 }
214
cl_parse_diphone_times(EST_Relation & diphone_stream,EST_Relation & source_lab)215 static void cl_parse_diphone_times(EST_Relation &diphone_stream,
216 EST_Relation &source_lab)
217 {
218 EST_Item *s, *u;
219 EST_Track *pm;
220 int e_frame, m_frame = 0;
221 float dur_1 = 0.0, dur_2 = 0.0, p_time;
222 float t_time = 0.0, end;
223 p_time = 0.0;
224
225 for (s = source_lab.head(), u = diphone_stream.head(); u; u = u->next(),
226 s = s->next())
227 {
228 pm = track(u->f("coefs"));
229 if (pm == 0)
230 {
231 cerr << "CLUNIT: couldn't get pitchmarks for " << u->name() << endl;
232 festival_error();
233 }
234
235 e_frame = pm->num_frames() - 1;
236 m_frame = u->I("middle_frame");
237
238 dur_1 = pm->t(m_frame);
239 dur_2 = pm->t(e_frame) - dur_1;
240
241 s->set("end", (dur_1 + p_time));
242 p_time = s->F("end") + dur_2;
243
244 end = dur_1 + dur_2 + t_time;
245 t_time = end;
246 u->set("end", t_time);
247 }
248 if (s)
249 s->set("end", (dur_2 + p_time));
250 }
251
clunits_simple_wave(LISP utt)252 static LISP clunits_simple_wave(LISP utt)
253 {
254 // Naive joining of waveforms
255 EST_Utterance *u = get_c_utt(utt);
256 EST_Wave *w = new EST_Wave;
257 EST_Wave *w1 = 0;
258 EST_Item *witem = 0;
259 EST_Item *s;
260 int size,i,k,c;
261
262 for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
263 size += wave(s->f("sig"))->num_samples();
264
265 if (u->relation("Unit")->head())
266 { // This will copy the necessary wave features across
267 s = u->relation("Unit")->head();
268 *w = *(wave(s->f("sig")));
269 }
270 i = w->num_samples();
271 w->resize(size); // its maximum size
272 for (s=u->relation("Unit")->head()->next(); s; s=s->next())
273 {
274 w1 = wave(s->f("sig"));
275 // Find last zero crossing
276 for (c=0; ((i > 0) && (c < 40)); c++,i--)
277 if (((w->a_no_check(i) < 0) && (w->a_no_check(i-1) >= 0)) ||
278 ((w->a_no_check(i) >= 0) && (w->a_no_check(i-1) < 0)))
279 break;
280 if (c == 40) i += 40;
281 // Find next zero crossing
282 for (c=0,k=1; ((k < w1->num_samples()) && (c < 40)); k++,i++)
283 if (((w1->a_no_check(k) < 0) && (w1->a_no_check(k-1) >= 0)) ||
284 ((w1->a_no_check(k) >= 0) && (w1->a_no_check(k-1) < 0)))
285 break;
286 if (c == 40) k -= 40;
287 for (; k < w1->num_samples(); k++,i++)
288 w->a_no_check(i) = w1->a_no_check(k);
289 }
290 w->resize(i);
291
292 witem = u->create_relation("Wave")->append();
293 witem->set_val("wave",est_val(w));
294
295 return utt;
296 }
297
clunits_windowed_wave(LISP utt)298 static LISP clunits_windowed_wave(LISP utt)
299 {
300 // windowed join, no prosodic modification
301 EST_Utterance *u = get_c_utt(utt);
302 EST_Wave *w = new EST_Wave;
303 EST_Wave *w1 = 0;
304 EST_Track *t1 = 0;
305 EST_Item *witem = 0;
306 EST_Item *s;
307 int size,i,k,wi,samp_idx, l_samp_idx;
308 int width, lwidth;
309 EST_Wave *www=0;
310
311 for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
312 size += wave(s->f("sig"))->num_samples();
313
314 if (u->relation("Unit")->head())
315 { // This will copy the necessary wave features across
316 s = u->relation("Unit")->head();
317 www = wave(s->f("sig"));
318 *w = *www;
319 }
320 w->resize(size); // its maximum size
321 wi=0;
322 lwidth = width = 0;
323 for (s=u->relation("Unit")->head(); s; s=s->next())
324 {
325 w1 = wave(s->f("sig"));
326 t1 = track(s->f("coefs"));
327
328 l_samp_idx = 0;
329 for (i=0; i < t1->num_frames()-1; i++)
330 {
331 samp_idx = (int)(t1->t(i)*w->sample_rate());
332 width = samp_idx - l_samp_idx;
333 if (clunits_smooth_frames && (i==0) && (lwidth != 0))
334 width = (width+lwidth)/2; // not sure if this is worth it
335 wi += width;
336 for (k=-width; ((k<width)&&((samp_idx+k)<w1->num_samples())) ;k++)
337 w->a(wi+k) +=
338 (int)(0.5*(1+cos((PI/(double)(width))*(double)k))*
339 w1->a(samp_idx+k));
340 l_samp_idx = samp_idx;
341 }
342 lwidth = width;
343 }
344 w->resize(wi);
345
346 witem = u->create_relation("Wave")->append();
347 witem->set_val("wave",est_val(w));
348
349 return utt;
350 }
351
clunits_smoothedjoin_wave(LISP utt)352 static LISP clunits_smoothedjoin_wave(LISP utt)
353 {
354 // Actually not very smoothed yet, just joined
355 EST_Utterance *u = get_c_utt(utt);
356 EST_Wave *w = new EST_Wave;
357 EST_Wave *w1 = 0;
358 EST_Track *t1 = 0;
359 EST_Item *witem = 0;
360 EST_Item *s;
361 int size,i,wi;
362 int samp_end, samp_start;
363 EST_Wave *www=0;
364
365 for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
366 {
367 samp_end = s->I("samp_end");
368 samp_start = s->I("samp_start");
369 size += samp_end-samp_start;
370 }
371
372 if (u->relation("Unit")->head())
373 { // This will copy the necessary wave features across
374 s = u->relation("Unit")->head();
375 www = wave(s->f("sig"));
376 *w = *www;
377 }
378 w->resize(size); // its maximum size
379 wi=0;
380 for (s=u->relation("Unit")->head(); s; s=s->next())
381 {
382 samp_end = s->I("samp_end");
383 samp_start = s->I("samp_start");
384 w1 = wave(s->f("sig"));
385 /* printf("%s %s %f %f %d %d\n",
386 (const char *)s->S("name"),
387 (const char *)s->S("fileid"),
388 (float)samp_start/(float)w->sample_rate(),
389 (float)samp_end/(float)w->sample_rate(),
390 w1->num_samples(),
391 samp_end); */
392 t1 = track(s->f("coefs"));
393 for (i=samp_start; i<samp_end; i++,wi++)
394 w->a_no_check(wi) = w1->a_no_check(i);
395 /* printf("%d %f\n",wi,(float)wi/(float)w->sample_rate()); */
396 }
397 w->resize(wi);
398
399 witem = u->create_relation("Wave")->append();
400 witem->set_val("wave",est_val(w));
401
402 return utt;
403 }
404
setup_clunits_params()405 static void setup_clunits_params()
406 {
407 // Set up params
408 clunits_params = siod_get_lval("clunits_params",
409 "CLUNITS: no parameters set for module");
410 optimal_coupling = get_param_int("optimal_coupling",clunits_params,0);
411 different_prev_pen = get_param_float("different_prev_pen",clunits_params,1000.0);
412 non_consecutive_pen = get_param_float("non_consectutive_pen",clunits_params,100.0);
413 extend_selections = get_param_int("extend_selections",clunits_params,0);
414 continuity_weight = get_param_float("continuity_weight",clunits_params,1);
415 f0_join_weight = get_param_float("f0_join_weight",clunits_params,0.0);
416 clunits_debug = get_param_int("clunits_debug",clunits_params,0);
417 clunits_log_scores = get_param_int("log_scores",clunits_params,0);
418 clunits_smooth_frames = get_param_int("smooth_frames",clunits_params,0);
419 clunit_name_feat = get_param_str("clunit_name_feat",clunits_params,"name");
420 selection_trees =
421 siod_get_lval("clunits_selection_trees",
422 "CLUNITS: clunits_selection_trees unbound");
423 }
424
TS_candlist(EST_Item * s,EST_Features & f)425 static EST_VTCandidate *TS_candlist(EST_Item *s,EST_Features &f)
426 {
427 // Return a list of candidate units for target s
428 // Use the appropriate CART to select a small group of candidates
429 EST_VTCandidate *all_cands = 0;
430 EST_VTCandidate *c, *gt;
431 LISP tree,group,l,pd,cc,ls;
432 EST_String name;
433 EST_String lookingfor;
434 CLunit *u;
435 int bbb,ccc;
436 float cluster_mean;
437 (void)f;
438 bbb=ccc=0;
439
440 lookingfor = s->S("clunit_name");
441 ls = siod(s);
442
443 cc = siod_get_lval("clunits_cand_hooks",NULL);
444 if (cc)
445 pd = apply_hooks(siod_get_lval("clunits_cand_hooks",NULL),
446 ls);
447 else
448 {
449 tree = car(cdr(siod_assoc_str(lookingfor,selection_trees)));
450 pd = wagon_pd(s,tree);
451 }
452 if (pd == NIL)
453 {
454 cerr << "CLUNITS: no predicted class for " <<
455 s->S("clunit_name") << endl;
456 festival_error();
457 }
458 group = car(pd);
459 cluster_mean = get_c_float(car(cdr(pd)));
460
461 for (bbb=0,l=group; l != NIL; l=cdr(l),bbb++)
462 {
463 c = new EST_VTCandidate;
464 name = s->S("clunit_name")+"_"+get_c_string(car(car(l)));
465 u = cldb->get_unit(name);
466 if (u == 0)
467 {
468 cerr << "CLUNITS: failed to find unit " << name <<
469 " in index" << endl;
470 festival_error();
471 }
472 cldb->load_join_coefs(u);
473 c->name = est_val(u);
474 c->s = s;
475 // Mean distance from others in cluster (could be precalculated)
476 c->score = get_c_float(car(cdr(car(l))))-cluster_mean;
477 c->score *= c->score;
478 // Maybe this should be divided by overall mean of set
479 // to normalise this figure (?)
480
481 c->next = all_cands;
482 all_cands = c;
483 }
484
485 if (extend_selections)
486 {
487 // An experiment, for all candidates of the previous
488 // item whose following is of this phone type, include
489 // them as a candidate
490 EST_Item *ppp = s->prev();
491 if (ppp)
492 {
493 EST_VTCandidate *lc = vtcand(ppp->f("unit_cands"));
494 for (ccc=0 ; lc && (ccc < extend_selections); lc = lc->next)
495 {
496 CLunit *unit = clunit(lc->name);
497 CLunit *next_unit;
498
499 if (unit->next_unit)
500 next_unit = unit->next_unit;
501 else
502 continue;
503 EST_String ss;
504 ss = next_unit->name.before("_");
505 if (ss.matches(".*_.*_.*"))
506 {
507 ss += "_";
508 ss += next_unit->name.after("_").before("_");
509 }
510 /* printf("%s %s\n",(const char *)ss, (const char *)lookingfor); */
511 for (gt=all_cands; gt; gt=gt->next)
512 if (clunit(gt->name)->name == next_unit->name)
513 break; /* got this one already */
514 if ((ss == lookingfor) && (gt == 0))
515 { // its the right type so add it
516 c = new EST_VTCandidate;
517 c->name = est_val(next_unit);
518 cldb->load_join_coefs(next_unit);
519 c->s = s;
520 c->score = 0;
521 c->next = all_cands;
522 all_cands = c;
523 bbb++;
524 ccc++;
525 }
526 }
527 }
528
529 s->set_val("unit_cands",est_val(all_cands));
530 }
531 if (clunits_debug)
532 printf("cands %d (extends %d) %s\n",bbb,ccc,(const char *)lookingfor);
533 return all_cands;
534 }
535
TS_npath(EST_VTPath * p,EST_VTCandidate * c,EST_Features & f)536 static EST_VTPath *TS_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f)
537 {
538 // Combine candidate c with previous path updating score
539 // with join cost
540 float cost;
541 EST_VTPath *np = new EST_VTPath;
542 CLunit *u0, *u1;
543 float u0_move=0.0, u1_move=0.0;
544 (void)f;
545
546 np->c = c;
547 np->from = p;
548 if ((p == 0) || (p->c == 0))
549 cost = 0; // nothing previous to join to
550 else
551 {
552 u0 = clunit(p->c->name);
553 u1 = clunit(c->name);
554 // printf("u0 %s u1 %s\n",
555 // (const char *)u0->name,
556 // (const char *)u1->name);
557 if (optimal_coupling)
558 cost = optimal_couple(u0,u1,u0_move,u1_move,
559 optimal_coupling,
560 different_prev_pen,
561 non_consecutive_pen);
562 else // naive measure
563 cost = naive_join_cost(u0,u1,c->s,u0_move,u1_move);
564 // When optimal_coupling == 2 the moves will be 0, just the scores
565 // are relevant
566 if (optimal_coupling == 1)
567 {
568 np->f.set(static_unit_prev_move,u0_move); // new (prev) end
569 np->f.set(static_unit_this_move,u1_move); // new start
570 }
571 }
572 // printf("cost %f continuity_weight %f\n", cost, continuity_weight);
573 cost *= continuity_weight;
574 np->state = c->pos; // "state" is candidate number
575 if (clunits_log_scores && (cost != 0))
576 cost = log(cost);
577
578 np->f.set(static_jscore,cost);
579 np->f.set(static_tscore,c->score);
580 if (p==0)
581 np->score = (c->score+cost);
582 else
583 np->score = (c->score+cost) + p->score;
584 np->f.set(static_cscore,np->score);
585
586 if (clunits_debug > 1)
587 printf("joining cost %f\n",np->score);
588 return np;
589 }
590
optimal_couple(CLunit * u0,CLunit * u1,float & u0_move,float & u1_move,int type,float different_prev_pen,float non_consecutive_pen)591 static float optimal_couple(CLunit *u0,
592 CLunit *u1,
593 float &u0_move,
594 float &u1_move,
595 int type,
596 float different_prev_pen,
597 float non_consecutive_pen
598 )
599 {
600 // Find combination cost of u0 to u1, checking for best
601 // frame up to n frames back in u0 and u1.
602 // Note this checks the u0 with u1's predecessor, which may or may not
603 // be of the same type
604 // There is some optimisation here in unit coeff access
605 EST_Track *u0_cep, *u1_p_cep;
606 float dist, best_val;
607 int i,eee;
608 int u0_st, u0_end;
609 int u1_p_st, u1_p_end;
610 int best_u0, best_u1;
611 CLunit *u1_p;
612 float f;
613
614 u1_p = u1->prev_unit;
615
616 u0_move = u0->end;
617 if (u1_p == 0)
618 u1_move = 0;
619 else
620 u1_move = u1_p->end;
621
622 if (u1_p == u0) // they are consecutive
623 return 0.0;
624 if (u1_p == 0) // hacky condition, when there is no previous we'll
625 return 0.0; // assume a good join (should be silence there)
626
627 if (u1_p->join_coeffs == 0)
628 cldb->load_join_coefs(u1_p);
629 // Get indexes into full cep for utterances rather than sub ceps
630 u0_cep = u0->join_coeffs;
631 u1_p_cep = u1_p->join_coeffs;
632
633 u0_end = u0_cep->num_frames();
634 u1_p_end = u1_p_cep->num_frames();
635
636 if (!streq(u1_p->base_name,u0->base_name))
637 { /* prev(u1) is a different phone from u0 so don't slide */
638 f = different_prev_pen;
639 u0_st = u0_cep->num_frames()-1;
640 u1_p_st = u1_p_cep->num_frames()-1;
641 }
642 else if (type == 2)
643 { /* we'll only check the edge for the join */
644 u0_st = u0_cep->num_frames()-1;
645 u1_p_st = u1_p_cep->num_frames()-1;
646 f = 1;
647 }
648 else
649 {
650 u0_st = (int)(u0_cep->num_frames() * 0.33);
651 u1_p_st = (int)(u1_p_cep->num_frames() * 0.33);
652 f = 1;
653 }
654
655 best_u0=u0_end;
656 best_u1=u1_p_end;
657 best_val = HUGE_VAL;
658
659 // Here we look for the best join without sliding the windows
660 if ((u0_end-u0_st) < (u1_p_end-u1_p_st))
661 eee = u0_end-u0_st;
662 else
663 eee = u1_p_end-u1_p_st;
664 for (i=0; i < eee; i++)
665 {
666 dist = frame_distance(*u0_cep,i+u0_st,
667 *u1_p_cep,i+u1_p_st,
668 cldb->cweights,
669 f0_join_weight);
670 if (dist < best_val)
671 {
672 best_val = dist;
673 best_u0 = i+u0_st;
674 best_u1 = i+u1_p_st;
675 }
676 }
677 #if 0
678 // This tries *all* possible matches in the pair, its slow
679 // and has a tendency to shorten things more than you'd like
680 // so we just use the more simple test above.
681 int j;
682 for (i=u0_st; i < u0_end; i++)
683 {
684 for (j=u1_p_st; j < u1_p_end; j++)
685 {
686 dist = frame_distance(*u0_cep,i,
687 *u1_p_cep,j,
688 cldb->cweights);
689 if (dist < best_val)
690 {
691 best_val = dist;
692 best_u0 = i;
693 best_u1 = j;
694 }
695 }
696 }
697 #endif
698
699 if (type == 1)
700 {
701 u0_move = u0_cep->t(best_u0);
702 u1_move = u1_p_cep->t(best_u1);
703 }
704
705 return non_consecutive_pen+(best_val*f);
706 }
707
naive_join_cost(CLunit * unit0,CLunit * unit1,EST_Item * s,float & u0_move,float & u1_move)708 static float naive_join_cost(CLunit *unit0, CLunit *unit1,
709 EST_Item *s,
710 float &u0_move,
711 float &u1_move)
712 {
713 // A naive join cost, because I haven't ported the info yet
714
715 u0_move = unit0->end;
716 u1_move = unit1->start;
717
718 if (unit0 == unit1)
719 return 0;
720 else if (unit1->prev_unit->name == unit0->name)
721 return 0;
722 else if (ph_is_silence(s->name()))
723 return 0;
724 else if (ph_is_stop(s->name()))
725 return 0.2;
726 else if (ph_is_fricative(s->name()))
727 return 0.3;
728 else
729 return 1.0;
730 }
731
cldb_load_all_coeffs(LISP filelist)732 static LISP cldb_load_all_coeffs(LISP filelist)
733 {
734 LISP f;
735
736 cldb = check_cldb();
737 for (f=filelist; f; f=cdr(f))
738 {
739 cldb->get_file_coefs_sig(get_c_string(car(f)));
740 cldb->get_file_join_coefs(get_c_string(car(f)));
741 }
742
743 return NIL;
744 }
745
festival_clunits_init(void)746 void festival_clunits_init(void)
747 {
748 // Initialization for clunits selection
749
750 proclaim_module("clunits",
751 "Copyright (C) University of Edinburgh and CMU 1997-2010\n");
752
753 gc_protect(&clunits_params);
754 gc_protect(&selection_trees);
755
756 festival_def_utt_module("Clunits_Select",clunits_select,
757 "(Clunits_Select UTT)\n\
758 Select units from current databases using cluster selection method.");
759
760 festival_def_utt_module("Clunits_Get_Units",clunits_get_units,
761 "(Clunits_Get_Units UTT)\n\
762 Construct Unit relation from the selected units in Segment and extract\n\
763 their parameters from the clunit db.");
764
765 festival_def_utt_module("Clunits_Simple_Wave",clunits_simple_wave,
766 "(Clunits_Simple_Wave UTT)\n\
767 Naively concatenate signals together into a single wave (for debugging).");
768
769 festival_def_utt_module("Clunits_Windowed_Wave",clunits_windowed_wave,
770 "(Clunits_Windowed_Wave UTT)\n\
771 Use hamming window over edges of units to join them, no prosodic \n\
772 modification though.");
773
774 festival_def_utt_module("Clunits_SmoothedJoin_Wave",clunits_smoothedjoin_wave,
775 "(Clunits_SmoothedJoin_Wave UTT)\n\
776 smoothed join.");
777
778 init_subr_1("clunits:load_db",cl_load_db,
779 "(clunits:load_db PARAMS)\n\
780 Load index file for cluster database and set up params, and select it.");
781
782 init_subr_1("clunits:select",cldb_select,
783 "(clunits:select NAME)\n\
784 Select a previously loaded cluster database.");
785
786 init_subr_1("clunits:load_all_coefs",cldb_load_all_coeffs,
787 "(clunits:load_all_coefs FILEIDLIST)\n\
788 Load in coefficients, signal and join coefficients for each named\n\
789 fileid. This is can be called at startup to to reduce the load time\n\
790 during synthesis (though may make the image large).");
791
792 init_subr_0("clunits:list",cldb_list,
793 "(clunits:list)\n\
794 List names of currently loaded cluster databases.");
795
796 init_subr_2("acost:build_disttabs",make_unit_distance_tables,
797 "(acost:build_disttabs UTTTYPES PARAMS)\n\
798 Built matrices of distances between each ling_item in each each list\n\
799 of ling_items in uttypes. Uses acoustic weights in PARAMS and save\n\
800 the result as a matrix for later use.");
801
802 init_subr_2("acost:utt.load_coeffs",acost_utt_load_coeffs,
803 "(acost:utt.load_coeffs UTT PARAMS)\n\
804 Load in the acoustic coefficients into UTT and set the Acoustic_Coeffs\n\
805 feature for each segment in UTT.");
806
807 init_subr_3("acost:file_difference",ac_distance_tracks,
808 "(acost:file_difference FILENAME1 FILENAME2 PARAMS)\n\
809 Load in the two named tracks and find the acoustic difference over all\n\
810 based on the weights in PARAMS.");
811
812 init_subr_2("cl_mapping", l_cl_mapping,
813 "(cl_mapping UTT PARAMS)\n\
814 Impose prosody upto some percentage, and not absolutely.");
815
816 }
817