1 /*************************************************************************/
2 /* */
3 /* Language Technologies Institute */
4 /* Carnegie Mellon University */
5 /* Copyright (c) 2007-2017 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Alan W Black (awb@cs.cmu.edu) */
34 /* Date: November 2007 */
35 /*************************************************************************/
36 /* */
37 /* Implementation of Clustergen, Statistical Parameter Synthesizer in */
38 /* Flite */
39 /* */
40 /* A statistical corpus based synthesizer. */
41 /* See Black, A. (2006), CLUSTERGEN: A Statistical Parametric */
42 /* Synthesizer using Trajectory Modeling", Interspeech 2006 - ICSLP, */
43 /* Pittsburgh, PA. */
44 /* http://www.cs.cmu.edu/~awb/papers/is2006/IS061394.PDF */
45 /* */
46 /* Uses MLSA for resynthesis and MLPG for smoothing */
47 /* mlsa and mlpg come from Festvox's VC code (which came in turn */
48 /* came from NITECH's HTS */
49 /* */
50 /*************************************************************************/
51
52 #include "cst_cg.h"
53 #include "cst_spamf0.h"
54 #include "cst_hrg.h"
55 #include "cst_utt_utils.h"
56 #include "cst_audio.h"
57
58 CST_VAL_REGISTER_TYPE(cg_db,cst_cg_db)
59
60 static cst_utterance *cg_make_hmmstates(cst_utterance *utt);
61 static cst_utterance *cg_make_params(cst_utterance *utt);
62 static cst_utterance *cg_predict_params(cst_utterance *utt);
63 static cst_utterance *cg_resynth(cst_utterance *utt);
64
/* Free a clustergen voice database and everything it owns.            */
/* A no-op when db->freeable == 0, i.e. the db is linked into the      */
/* (read-only) data segment rather than heap-allocated at load time.   */
void delete_cg_db(cst_cg_db *db)
{
    int i,j;

    if (db->freeable == 0)
        return; /* its in the data segment, so not freeable */

    /* Woo Hoo! We're gonna free this garbage with a big mallet */
    /* In spite of what the const qualifiers say ... */
    cst_free((void *)db->name);

    /* NULL-terminated list of cluster type names */
    for (i=0; db->types && db->types[i]; i++)
        cst_free((void *)db->types[i]);
    cst_free((void *)db->types);

    /* One NULL-terminated array of F0 CARTs per F0 model */
    for (j=0; j<db->num_f0_models; j++)
    {
        for (i=0; db->f0_trees[j] && db->f0_trees[j][i]; i++)
            delete_cart((cst_cart *)(void *)db->f0_trees[j][i]);
        cst_free((void *)db->f0_trees[j]);
    }
    cst_free((void *)db->f0_trees);

    /* One NULL-terminated array of spectral-parameter CARTs per model */
    for (j=0; j<db->num_param_models; j++)
    {
        for (i=0; db->param_trees[j] && db->param_trees[j][i]; i++)
            delete_cart((cst_cart *)(void *)db->param_trees[j][i]);
        cst_free((void *)db->param_trees[j]);
    }
    cst_free((void *)db->param_trees);

    /* Optional SPAM F0 (accent/phrase) model */
    if (db->spamf0)
    {
        delete_cart((cst_cart *)(void *)db->spamf0_accent_tree);
        delete_cart((cst_cart *)(void *)db->spamf0_phrase_tree);
        for (i=0; i< db->num_frames_spamf0_accent; i++)
            cst_free((void *)db->spamf0_accent_vectors[i]);
        cst_free((void *)db->spamf0_accent_vectors);
    }

    /* Per-model, per-frame parameter vectors */
    for (j=0; j<db->num_param_models; j++)
    {
        for (i=0; i<db->num_frames[j]; i++)
            cst_free((void *)db->model_vectors[j][i]);
        cst_free((void *)db->model_vectors[j]);
    }

    cst_free((void *)db->model_min);
    cst_free((void *)db->model_range);

    /* Quantization tables exist only for the non-minrange model shapes */
    if (db->model_shape != CST_CG_MODEL_SHAPE_BASE_MINRANGE)
    {
        for (j = 0; j<db->num_param_models; j++)
        {
            for (i=0; i<db->num_channels[j]; i++)
                cst_free((void *)db->qtable[j][i]);
            cst_free((void *)db->qtable[j]);
        }
    }
    cst_free((void *)db->qtable);

    /* Moved to here so they can be used for the model_shape freeing */
    cst_free(db->num_channels);
    cst_free(db->num_frames);
    cst_free((void *)db->model_vectors);

    /* Duration models: NULL-terminated stats lists plus one CART each */
    for (j = 0; j<db->num_dur_models; j++)
    {
        for (i=0; db->dur_stats[j] && db->dur_stats[j][i]; i++)
        {
            cst_free((void *)db->dur_stats[j][i]->phone);
            cst_free((void *)db->dur_stats[j][i]);
        }
        cst_free((void *)db->dur_stats[j]);
        delete_cart((cst_cart *)(void *)db->dur_cart[j]);
    }
    cst_free((void *)db->dur_stats);
    cst_free((void *)db->dur_cart);

    /* phone_states: NULL-terminated list of NULL-terminated name lists */
    for (i=0; db->phone_states && db->phone_states[i]; i++)
    {
        for (j=0; db->phone_states[i][j]; j++)
            cst_free((void *)db->phone_states[i][j]);
        cst_free((void *)db->phone_states[i]);
    }
    cst_free((void *)db->phone_states);

    cst_free((void *)db->dynwin);  /* MLPG dynamic-feature window */

    /* Mixed-excitation filter coefficients */
    for (i=0; i<db->ME_num; i++)
        cst_free((void *)db->me_h[i]);
    cst_free((void *)db->me_h);

    cst_free((void *)db);
}
160
161 /* */
/* Top-level clustergen synthesis pipeline: build HMM states, lay out */
/* parameter frames, predict the parameters (plus optional SPAM F0),  */
/* and resynthesize the waveform.  Returns the (modified) utterance.  */
cst_utterance *cg_synth(cst_utterance *utt)
{
    cst_cg_db *db = val_cg_db(utt_feat_val(utt,"cg_db"));

    cg_make_hmmstates(utt);
    cg_make_params(utt);
    cg_predict_params(utt);
    if (db->spamf0)
        cst_spamf0(utt);
    cg_resynth(utt);

    return utt;
}
178
/* Predict the duration (in seconds) of HMM state s.                  */
/* Every duration model predicts a z-score; the average z-score is    */
/* denormalized with the first model's mean/stddev stats for the      */
/* state's phone (unknown phones fall back to entry 0).               */
static float cg_state_duration(cst_item *s, cst_cg_db *cg_db)
{
    float zdur, dur;
    const char *n;
    int i, x, dm;

    for (dm=0,zdur=0.0; dm < cg_db->num_dur_models; dm++)
        zdur += val_float(cart_interpret(s,cg_db->dur_cart[dm]));
    /* Guard against a voice with no duration models: the original     */
    /* unconditional division produced NaN/inf here when dm == 0.      */
    if (dm > 0)
        zdur /= dm; /* get average zdur prediction from all dur models */
    n = item_feat_string(s,"name");

    /* Note we only use the dur stats from the first model, that is */
    /* correct, but wouldn't be if the dur tree was trained on different */
    /* data */
    for (x=i=0; cg_db->dur_stats[0][i]; i++)
    {
        if (cst_streq(cg_db->dur_stats[0][i]->phone,n))
        {
            x=i;
            break;
        }
    }
    if (!cg_db->dur_stats[0][i]) /* unknown type name */
        x = 0;

    /* Denormalize: dur = z * stddev + mean */
    dur = (zdur*cg_db->dur_stats[0][x]->stddev)+cg_db->dur_stats[0][x]->mean;

    return dur;
}
210
/* Build the HMM state structure below the segment structure: create  */
/* the "HMMstate" relation with one item per sub-phonetic state of    */
/* each segment, linked through the "segstate" relation.              */
static cst_utterance *cg_make_hmmstates(cst_utterance *utt)
{
    cst_cg_db *db;
    cst_relation *states, *segstates;
    cst_item *seg, *state, *seg_parent;
    const char *phname;
    int pi, si;

    db = val_cg_db(utt_feat_val(utt,"cg_db"));
    states = utt_relation_create(utt,"HMMstate");
    segstates = utt_relation_create(utt,"segstate");

    for (seg = utt_rel_head(utt,"Segment"); seg != NULL; seg = item_next(seg))
    {
        seg_parent = relation_append(segstates,seg);
        phname = item_feat_string(seg,"name");

        /* Find the state-name list for this phone; entry 0 is the */
        /* phone name itself. */
        pi = 0;
        while (db->phone_states[pi] &&
               !cst_streq(phname,db->phone_states[pi][0]))
            pi++;
        if (db->phone_states[pi] == NULL)
            pi = 0; /* unknown phoneme: fall back to the first entry */

        /* Entries 1..n are the sub-state names for this phone. */
        for (si = 1; db->phone_states[pi][si]; si++)
        {
            state = relation_append(states,NULL);
            item_add_daughter(seg_parent,state);
            item_set_string(state,"name",db->phone_states[pi][si]);
            item_set_int(state,"statepos",si);
        }
    }

    return utt;
}
244
static cst_utterance *cg_make_params(cst_utterance *utt)
{
    /* puts in the frame items */
    /* historically called "mcep" but can actually be any random vectors */
    /* One "mcep" item is created per frame_advance of predicted       */
    /* duration, linked to its HMM state through "mcep_link"; state    */
    /* and segment "end" times are set along the way, and the total    */
    /* frame count is stored as "param_track_num_frames".              */
    cst_cg_db *cg_db;
    cst_relation *mcep, *mcep_link;
    cst_item *s, *mcep_parent, *mcep_frame;
    int num_frames;
    float start, end;
    float dur_stretch, tok_stretch, rdur;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    mcep = utt_relation_create(utt,"mcep");
    mcep_link = utt_relation_create(utt,"mcep_link");
    end = 0.0;
    num_frames = 0;
    dur_stretch = get_param_float(utt->features,"duration_stretch", 1.0);

    for (s = utt_rel_head(utt,"HMMstate"); s; s=item_next(s))
    {
        start = end;
        /* Token-local stretch; 0 means "not set" and is treated as 1.0 */
        tok_stretch = ffeature_float(s,"R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_duration_stretch");
        if (tok_stretch == 0)
            tok_stretch = 1.0;
        rdur = tok_stretch*dur_stretch*cg_state_duration(s,cg_db);
        /* Guarantee duration to be at least one frame */
        if (rdur < cg_db->frame_advance)
            end = start + cg_db->frame_advance;
        else
            end = start + rdur;
        item_set_float(s,"end",end);
        mcep_parent = relation_append(mcep_link, s);
        /* Emit one frame item per frame_advance up to this state's end */
        for ( ; (num_frames * cg_db->frame_advance) <= end; num_frames++ )
        {
            mcep_frame = relation_append(mcep,NULL);
            item_add_daughter(mcep_parent,mcep_frame);
            item_set_int(mcep_frame,"frame_number",num_frames);
            item_set(mcep_frame,"name",item_feat(mcep_parent,"name"));
        }
    }

    /* Copy duration up onto Segment relation */
    for (s = utt_rel_head(utt,"Segment"); s; s=item_next(s))
        item_set(s,"end",ffeature(s,"R:segstate.daughtern.end"));

    utt_set_feat_int(utt,"param_track_num_frames",num_frames);

    return utt;
}
294
295 #if CG_OLD
/* Legacy voicing decision, compiled only when CG_OLD is defined.      */
/* A frame is unvoiced only when its phone is neither a vowel          */
/* (ph_vc is "-") nor a voiced consonant (ph_cvox is "-").             */
static int voiced_frame(cst_item *m)
{
    const char *ph_vc;
    const char *ph_cvox;

    ph_vc = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_vc");
    ph_cvox = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_cvox");

    if (cst_streq("-",ph_vc) &&
        cst_streq("-",ph_cvox))
        return 0; /* unvoiced */
    else
        return 1; /* voiced */
}
310 #endif
311
/* Decide whether frame m should be voiced (1) or unvoiced (0):       */
/* silence is never voiced, phonologically voiced phones always are,  */
/* and everything else is decided by the predicted voicing strength.  */
static int voiced_frame(cst_item *m)
{
    const char *vc, *name;

    vc = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_vc");
    name = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.name");

    if (cst_streq(name,"pau"))
        return 0;  /* silence: always unvoiced */
    if (cst_streq("+",vc))
        return 1;  /* phonologically voiced */

    /* Even though the range is 0-10, I *do* mean 0.5 */
    return (item_feat_float(m,"voicing") > 0.5) ? 1 : 0;
}
330
/* Evaluate a uniform Catmull-Rom spline at parameter p in [0,1].     */
/* p1 and p2 are the segment endpoints, p0 and p3 the neighbouring    */
/* control points; returns p1 at p=0 and p2 at p=1.                   */
/* http://www.mvps.org/directx/articles/ */
static float catmull_rom_spline(float p,float p0,float p1,float p2,float p3)
{
    /* Same cubic as the classic expanded formulation, factored in
       Horner form: q(p) = 0.5*(c0 + p*(c1 + p*(c2 + p*c3))) */
    float c0 = 2.0 * p1;
    float c1 = p2 - p0;
    float c2 = (2.0 * p0) - (5.0 * p1) + (4.0 * p2) - p3;
    float c3 = -p0 + (3.0 * p1) - (3.0 * p2) + p3;

    return 0.5 * (c0 + p * (c1 + p * (c2 + p * c3)));
}
351
/* Smooth the predicted F0 contour (channel 0 of param_track) by       */
/* replacing it, syllable by syllable, with Catmull-Rom interpolation  */
/* through each syllable's start/mid/end F0 values, using the          */
/* neighbouring syllables' mid F0s as outer control points.            */
static void cg_F0_interpolate_spline(cst_utterance *utt,
                                     cst_track *param_track)
{
    float start_f0, mid_f0, end_f0;
    int start_index, end_index, mid_index;
    int nsi, nei, nmi;      /* next syllable indices */
    float nmid_f0, pmid_f0; /* next/previous syllable mid F0 */
    cst_item *syl;
    int i;
    float m;                /* 1/(span length): step in [0,1] per frame */

    start_f0 = mid_f0 = end_f0 = -1.0;  /* -1 marks "not yet seen" */

    for (syl=utt_rel_head(utt,"Syllable"); syl; syl=item_next(syl))
    {
        /* First, last and middle frame indices of this syllable */
        start_index = ffeature_int(syl,"R:SylStructure.daughter1.R:segstate.daughter1.R:mcep_link.daughter1.frame_number");
        end_index = ffeature_int(syl,"R:SylStructure.daughtern.R:segstate.daughtern.R:mcep_link.daughtern.frame_number");
        mid_index = (int)((start_index + end_index)/2.0);

        start_f0 = param_track->frames[start_index][0];
        if (end_f0 > 0.0)
            start_f0 = end_f0; /* not first time through */
        if (mid_f0 < 0.0)
            pmid_f0 = start_f0; /* first time through */
        else
            pmid_f0 = mid_f0;   /* previous syllable's mid F0 */
        mid_f0 = param_track->frames[mid_index][0];
        if (item_next(syl)) /* not last syllable */
            end_f0 = (param_track->frames[end_index-1][0]+
                      param_track->frames[end_index][0])/2.0;
        else
            end_f0 = param_track->frames[end_index-1][0];
        nmid_f0=end_f0; /* in case there is no next syl */

        if (item_next(syl))
        {
            /* Mid-frame F0 of the next syllable, the trailing control point */
            nsi = ffeature_int(syl,"n.R:SylStructure.daughter1.R:segstate.daughter1.R:mcep_link.daughter1.frame_number");
            nei = ffeature_int(syl,"n.R:SylStructure.daughtern.R:segstate.daughtern.R:mcep_link.daughtern.frame_number");
            nmi = (int)((nsi + nei)/2.0);
            nmid_f0 = param_track->frames[nmi][0];
        }
        /* start to mid syl */
        /* NOTE(review): when mid_index == start_index the divisor is 0; */
        /* the loop below then executes zero times so m is never used,   */
        /* but the division itself still occurs -- confirm this is       */
        /* benign on all target platforms.                               */
        m = 1.0 / (mid_index - start_index);
        for (i=0; ((start_index+i)<mid_index); i++)
            param_track->frames[start_index+i][0] =
                catmull_rom_spline(i*m,pmid_f0,start_f0,mid_f0,end_f0);

        /* mid syl to end */
        m = 1.0 / (end_index - mid_index);
        for (i=0; ((mid_index+i)<end_index); i++)
            param_track->frames[mid_index+i][0] =
                catmull_rom_spline(i*m,start_f0,mid_f0,end_f0,nmid_f0);
    }

    return;
}
408
409 #if 0
/* Disabled alternative smoother (see the surrounding #if 0): a       */
/* simple moving average of the F0 channel -- each voiced frame is    */
/* averaged with its positive-F0 neighbours; frames with F0 <= 0 are  */
/* left untouched.                                                    */
static void cg_smooth_F0_naive(cst_track *param_track)
{
    float l,s;  /* l: previous frame's pre-update F0; s: running sum */
    int i,c;    /* c: count of positive F0 values summed into s */

    l = 0.0;
    for (i=0; i<param_track->num_frames-1; i++)
    {
        c = 0; s = 0;
        if (l > 0.0)
        {
            c++; s+=l;
        }
        if (param_track->frames[i+1][0] > 0.0)
        {
            c++; s+=param_track->frames[i+1][0];
        }
        l = param_track->frames[i][0];  /* save before overwriting */
        if (param_track->frames[i][0] > 0.0)
        {
            c++; s+=param_track->frames[i][0];
            param_track->frames[i][0] = s/c;
        }
    }

    return;
}
437 #endif
438
/* Smooth the F0 channel with spline interpolation, rescale it to the  */
/* target mean/stddev (utterance features "int_f0_target_mean" and     */
/* "int_f0_target_stddev", defaulting to the voice's own statistics,   */
/* with an optional "f0_shift" factor), and zero unvoiced frames.      */
static void cg_smooth_F0(cst_utterance *utt,
                         cst_cg_db *cg_db,
                         cst_track *param_track)
{
    /* Smooth F0 and mark unvoice frames as 0.0 */
    cst_item *mcep;
    int i;
    float mean, stddev;

    /* cg_smooth_F0_naive(param_track); */

    cg_F0_interpolate_spline(utt,param_track);

    mean = get_param_float(utt->features,"int_f0_target_mean", cg_db->f0_mean);
    mean *= get_param_float(utt->features,"f0_shift", 1.0);
    stddev =
        get_param_float(utt->features,"int_f0_target_stddev", cg_db->f0_stddev);
#if 0
    /* Debug: dump F0 and the voicing channel to a file */
    FILE *ftt; int ii;
    ftt = cst_fopen("awb.f0",CST_OPEN_WRITE);
    printf("awb_debug saving F0\n");
    for (ii=0; ii<param_track->num_frames; ii++)
        cst_fprintf(ftt,"%f %f\n",param_track->frames[ii][0],
                    param_track->frames[ii][param_track->num_channels-2]);
    cst_fclose(ftt);
#endif

    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        if (voiced_frame(mcep))
        {
            /* scale the F0 -- which normally wont change it at all */
            /* (z-normalize with the voice stats, denormalize with the */
            /* target stats) */
            param_track->frames[i][0] =
                (((param_track->frames[i][0]-cg_db->f0_mean)/cg_db->f0_stddev)
                 *stddev)+mean;
            /* Some safety checks: clamp to a plausible speech F0 range */
            if (param_track->frames[i][0] < 50)
                param_track->frames[i][0] = 50;
            if (param_track->frames[i][0] > 700)
                param_track->frames[i][0] = 700;
        }
        else /* Unvoice it */
            param_track->frames[i][0] = 0.0;
    }

    return;
}
486
/* Unpack frame f of parameter model pm into v (num_channels[pm]       */
/* floats).  In the quantized shapes each stored value packs two 8-bit */
/* quantization-table indices (value/256 and value%256); the base      */
/* shape stores 16-bit values scaled into [model_min, model_min +      */
/* model_range] per channel.  Always returns 0.                        */
static int unpack_model_vector(cst_cg_db *cg_db,int pm,int f,float *v)
{
    /* This unpacked the potentially compressed/quantized data from the model */
    int i,j;

    if (cg_db->model_shape == CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS)
    {
        /* Generic quantization: channels (2i, 2i+1) come from the high */
        /* and low byte of stored value i */
        for (i=0; i<cg_db->num_channels[pm]/2; i++)
        {
            v[i*2] = cg_db->qtable[pm][i*2][cg_db->model_vectors[pm][f][i]/256];
            v[(i*2)+1] =
                cg_db->qtable[pm][(i*2)+1][cg_db->model_vectors[pm][f][i]%256];
        }
#if 0
        printf("awb_debug %d\n",f);
        for (i=0; i<cg_db->num_channels[pm]; i++)
            printf("%f ",v[i]);
        printf("\n");
        for (i=0; i<cg_db->num_channels[pm]/2; i++)
            printf("%d %d ",cg_db->model_vectors[pm][f][i]/256,
                   cg_db->model_vectors[pm][f][i]%256);
        printf("\n");
#endif
        return 0;
    }
    if (cg_db->model_shape == CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS_41)
    {
        /* Hand-packed 41-value layout: i indexes stored (packed)       */
        /* values, j tracks the output mean/stddev channel pairs.       */
        j=1; /* skip F0 mean/stddev */
        for (i=0; i<25; i++,j++) /* mcep static mean/stddev */
        {
            v[j*2] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
            v[(j*2)+1] =
                cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]%256];
        }
        for (i=25; i<25+12; i+=1,j+=2) /* mcep deltas no mean/stddev */
        {
            v[(j*2)+1] = cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]/256];
            v[(j*2)+3] =
                cg_db->qtable[pm][(j*2)+3][cg_db->model_vectors[pm][f][i]%256];
        }
        /* one delta, one me */
        v[(j*2)+1] = cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+2] = cg_db->qtable[pm][(j*2)+2][cg_db->model_vectors[pm][f][i]%256];
        i++; j+=2;
        /* one me, another me */
        v[(j*2)] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+2] = cg_db->qtable[pm][(j*2)+2][cg_db->model_vectors[pm][f][i]%256];
        i++; j+=2;
        /* one me, another me */
        v[(j*2)] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+2] = cg_db->qtable[pm][(j*2)+2][cg_db->model_vectors[pm][f][i]%256];
        i++; j+=2;
        /* one voicing and another v-stddef */
        v[(j*2)] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+1] = cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]%256];
#if 0
        printf("awb_debug pm %d frame %d\n",pm,f);
        for (i=0; i<cg_db->num_channels[pm]; i++)
            printf("%f ",v[i]);
        printf("\n");
#endif
        return 0;
    }
    /* if (cg_db->model_shape == CST_CG_MODEL_SHAPE_BASE_MINRANGE) */
    else /* let's always do this second one in case model_shape isn't set */
    {
        /* 16-bit value scaled into the per-channel min/range */
        for (i=0; i<cg_db->num_channels[pm]; i++)
        {
            v[i] = cg_db->model_min[i]+
                ((float)((cg_db->model_vectors[pm][f][i])/
                         65535.0)*cg_db->model_range[i]);
        }
        return 0;
    }
}
562
/* Predict the parameter track: for every "mcep" frame item, predict   */
/* F0 (averaged over all F0 models) and a spectral vector (averaged    */
/* over all param models), smooth F0, and store the result as the      */
/* utterance's "param_track" (plus "str_track" for mixed excitation).  */
static cst_utterance *cg_predict_params(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_item *mcep;
    const cst_cart *mcep_tree, *f0_tree;
    int i,j,f,p,o,pm;
    const char *mname;
    float *unpacked_vector;  /* scratch buffer for one unpacked model frame */
    float f0_val, f0_bit;
    float local_gain, voicing;
    int fff;                 /* channel stride: 1 keeps stddevs (MLPG), 2 skips them */
    int extra_feats = 0;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = new_track();
    if (cg_db->do_mlpg) /* which should be the default */
        fff = 1; /* copy details with stddevs */
    else
        fff = 2; /* copy details without stddevs */

    extra_feats = 1; /* voicing */
    if (cg_db->mixed_excitation)
    {
        extra_feats += 5;  /* 5 band-strength channels */
        str_track = new_track();
        cst_track_resize(str_track,
                         utt_feat_int(utt,"param_track_num_frames"),
                         5);
    }

    cst_track_resize(param_track,
                     utt_feat_int(utt,"param_track_num_frames"),
                     (cg_db->num_channels[0]/fff)-
                     (2 * extra_feats));/* no voicing or str */
    unpacked_vector = cst_alloc(float,cg_db->num_channels[0]);
    f = 0;
    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        mname = item_feat_string(mcep,"name");
        /* Token-local gain; 0 means "not set" and is treated as 1.0 */
        local_gain = ffeature_float(mcep,"R:mcep_link.parent.R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_gain");
        if (local_gain == 0.0) local_gain = 1.0;
        /* Find the tree index for this frame's cluster type name */
        for (p=0; cg_db->types[p]; p++)
            if (cst_streq(mname,cg_db->types[p]))
                break;
        if (cg_db->types[p] == NULL)
            p=0; /* if there isn't a matching tree, use the first one */

        /* Predict F0 */
        for (f0_val=pm=0; pm<cg_db->num_f0_models; pm++)
        {
            f0_tree = cg_db->f0_trees[pm][p];
            f0_bit = val_float(cart_interpret(mcep,f0_tree));
            f0_val += f0_bit;
        }
        param_track->frames[i][0] = f0_val/cg_db->num_f0_models;
        if (param_track->frames[i][0] < 50.0)
            param_track->frames[i][0] = 0.0;  /* below speech range: unvoiced */
        /* what about stddev ? */

        /* We only have multiple models now, but the default is one model */
        /* Predict spectral coeffs */
        voicing = 0.0;
        for (pm=0; pm<cg_db->num_param_models; pm++)
        {
            mcep_tree = cg_db->param_trees[pm][p];
            f = val_int(cart_interpret(mcep,mcep_tree));
            /* If there is one model this will be fine, if there are */
            /* multiple models this will be the nth model */
            item_set_int(mcep,"clustergen_param_frame",f);

            /* Unpack the model[pm][f] vector */
            unpack_model_vector(cg_db,pm,f,unpacked_vector);

            /* Old code used to average in param[0] with F0 too (???) */

            /* Accumulate the model average of the spectral channels; */
            /* stride fff selects means only when stddevs are present */
            for (j=2; j<param_track->num_channels; j++)
            {
                if (pm == 0) param_track->frames[i][j] = 0.0;
                param_track->frames[i][j] += unpacked_vector[j*fff]/
                    (float)cg_db->num_param_models;
            }

            if (cg_db->mixed_excitation)
            {
                /* 5 band-strength channels follow the spectral channels */
                o = j;
                for (j=0; j<5; j++)
                {
                    if (pm == 0) str_track->frames[i][j] = 0.0;
                    str_track->frames[i][j] +=
                        unpacked_vector[(o+(2*j))*fff] /
                        (float)cg_db->num_param_models;
                }
            }

            /* last coefficient is average voicing for cluster */
            /* NOTE(review): running average -- the accumulator is      */
            /* divided by (pm+1) before each new contribution is added; */
            /* confirm this weighting is intended when there are more   */
            /* than two param models.                                   */
            voicing /= (float)(pm+1);
            voicing += unpacked_vector[cg_db->num_channels[pm]-2] /
                (float)(pm+1);
        }
        item_set_float(mcep,"voicing",voicing);
        /* Apply local gain to c0 */
        param_track->frames[i][2] *= local_gain;

        param_track->times[i] = i * cg_db->frame_advance;
    }

    cst_free(unpacked_vector);
    cg_smooth_F0(utt,cg_db,param_track);

    utt_set_feat(utt,"param_track",track_val(param_track));
    if (cg_db->mixed_excitation)
        utt_set_feat(utt,"str_track",track_val(str_track));

    return utt;
}
681
/* Generate the waveform from the utterance's "param_track" with the   */
/* MLSA filter (optionally smoothing with MLPG first) and attach it    */
/* via utt_set_wave.  Supports incremental delivery through the        */
/* "streaming_info" feature; on synthesis failure an empty wave is     */
/* attached and "Interrupted" is set on the utterance.                 */
static cst_utterance *cg_resynth(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_wave *w;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_track *smoothed_track;
    const cst_val *streaming_info_val;
    cst_audio_streaming_info *asi = NULL;
    int mlsa_speed_param = 0;

    streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
    if (streaming_info_val)
    {
        asi = val_audio_streaming_info(streaming_info_val);
        asi->utt = utt;
    }
    /* Values 5-15 might be reasonably to speed things up. This number */
    /* is used to reduce the number of parameters used in the mceps */
    /* e.g. value 10 will speed up from 21.0 faster than real time */
    /* to 26.4 times faster than real time (for builtin rms) */
    mlsa_speed_param = get_param_int(utt->features,"mlsa_speed_param",0);

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = val_track(utt_feat_val(utt,"param_track"));
    if (cg_db->mixed_excitation)
        str_track = val_track(utt_feat_val(utt,"str_track"));

    if (cg_db->do_mlpg)
    {
        smoothed_track = mlpg(param_track, cg_db);
        w = mlsa_resynthesis(smoothed_track,str_track,cg_db,
                             asi,mlsa_speed_param);
        delete_track(smoothed_track);
    }
    else
        w=mlsa_resynthesis(param_track,str_track,cg_db,
                           asi,mlsa_speed_param);

    if (w == NULL)
    {
        /* Synthesis Failed, probably because it was interrupted */
        utt_set_feat_int(utt,"Interrupted",1);
        w = new_wave();
    }

#if 0
    /* Apply local gain */
    /* NOTE(review): dead code -- the body below is incomplete/garbled */
    /* and would not compile if enabled.                               */
    for (i=0,tok=utt_rel_head(utt,"Token"); tok; i++,tok=item_next(tok))
    {
        if (item_feat_present(tok,"local_gain"))
            local_gain = item_feat_float(tokget_param_fffeature_float(tok,"R:mcep_link.parent.R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_gain");

    }
#endif

    utt_set_wave(utt,w);

    return utt;
}
742
743
744
745