1 /*************************************************************************/
2 /*                                                                       */
3 /*                  Language Technologies Institute                      */
4 /*                     Carnegie Mellon University                        */
5 /*                     Copyright (c) 2007-2017                           */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*             Authors:  Alan W Black (awb@cs.cmu.edu)                   */
34 /*                Date:  November 2007                                   */
35 /*************************************************************************/
36 /*                                                                       */
37 /*  Implementation of Clustergen, Statistical Parameter Synthesizer in   */
38 /*  Flite                                                                */
39 /*                                                                       */
40 /*  A statistical corpus based synthesizer.                              */
/*  See Black, A. (2006), "CLUSTERGEN: A Statistical Parametric          */
/*  Synthesizer using Trajectory Modeling", Interspeech 2006 - ICSLP,    */
43 /*  Pittsburgh, PA.                                                      */
44 /*  http://www.cs.cmu.edu/~awb/papers/is2006/IS061394.PDF                */
45 /*                                                                       */
46 /*  Uses MLSA for resynthesis and MLPG for smoothing                     */
/*  mlsa and mlpg come from Festvox's VC code (which in turn came        */
/*  from NITECH's HTS)                                                   */
49 /*                                                                       */
50 /*************************************************************************/
51 
52 #include "cst_cg.h"
53 #include "cst_spamf0.h"
54 #include "cst_hrg.h"
55 #include "cst_utt_utils.h"
56 #include "cst_audio.h"
57 
58 CST_VAL_REGISTER_TYPE(cg_db,cst_cg_db)
59 
60 static cst_utterance *cg_make_hmmstates(cst_utterance *utt);
61 static cst_utterance *cg_make_params(cst_utterance *utt);
62 static cst_utterance *cg_predict_params(cst_utterance *utt);
63 static cst_utterance *cg_resynth(cst_utterance *utt);
64 
/* Free a dynamically loaded clustergen voice database.  Voices that are
   compiled into the binary (data segment) have freeable == 0 and are
   left untouched. */
void delete_cg_db(cst_cg_db *db)
{
    int i,j;

    if (db->freeable == 0)
        return;  /* its in the data segment, so not freeable */

    /* Woo Hoo!  We're gonna free this garbage with a big mallet */
    /* In spite of what the const qualifiers say ... */
    cst_free((void *)db->name);

    /* NULL-terminated list of stream/tree type names */
    for (i=0; db->types && db->types[i]; i++)
        cst_free((void *)db->types[i]);
    cst_free((void *)db->types);

    /* One NULL-terminated CART list per F0 model */
    for (j=0; j<db->num_f0_models; j++)
    {
        for (i=0; db->f0_trees[j] && db->f0_trees[j][i]; i++)
            delete_cart((cst_cart *)(void *)db->f0_trees[j][i]);
        cst_free((void *)db->f0_trees[j]);
    }
    cst_free((void *)db->f0_trees);

    /* One NULL-terminated CART list per spectral parameter model */
    for (j=0; j<db->num_param_models; j++)
    {
        for (i=0; db->param_trees[j] && db->param_trees[j][i]; i++)
            delete_cart((cst_cart *)(void *)db->param_trees[j][i]);
        cst_free((void *)db->param_trees[j]);
    }
    cst_free((void *)db->param_trees);

    /* SPAM F0 model (optional) */
    if (db->spamf0)
    {
        delete_cart((cst_cart *)(void *)db->spamf0_accent_tree);
        delete_cart((cst_cart *)(void *)db->spamf0_phrase_tree);
        for (i=0; i< db->num_frames_spamf0_accent; i++)
            cst_free((void *)db->spamf0_accent_vectors[i]);
        cst_free((void *)db->spamf0_accent_vectors);
    }

    /* Per-model frame vectors; row counts come from num_frames[j] */
    for (j=0; j<db->num_param_models; j++)
    {
        for (i=0; i<db->num_frames[j]; i++)
            cst_free((void *)db->model_vectors[j][i]);
        cst_free((void *)db->model_vectors[j]);
    }

    cst_free((void *)db->model_min);
    cst_free((void *)db->model_range);

    /* Quantization tables only exist for non-minrange model shapes */
    if (db->model_shape != CST_CG_MODEL_SHAPE_BASE_MINRANGE)
    {
        for (j = 0; j<db->num_param_models; j++)
        {
            for (i=0; i<db->num_channels[j]; i++)
                cst_free((void *)db->qtable[j][i]);
            cst_free((void *)db->qtable[j]);
        }
    }
    cst_free((void *)db->qtable);

    /* Moved to here so they can be used for the model_shape freeing */
    cst_free(db->num_channels);
    cst_free(db->num_frames);
    cst_free((void *)db->model_vectors);

    /* Duration stats (NULL-terminated per model) and duration CARTs */
    for (j = 0; j<db->num_dur_models; j++)
    {
        for (i=0; db->dur_stats[j] && db->dur_stats[j][i]; i++)
        {
            cst_free((void *)db->dur_stats[j][i]->phone);
            cst_free((void *)db->dur_stats[j][i]);
        }
        cst_free((void *)db->dur_stats[j]);
        delete_cart((cst_cart *)(void *)db->dur_cart[j]);
    }
    cst_free((void *)db->dur_stats);
    cst_free((void *)db->dur_cart);

    /* phone_states[i] is NULL-terminated: phone name then state names */
    for (i=0; db->phone_states && db->phone_states[i]; i++)
    {
        for (j=0; db->phone_states[i][j]; j++)
            cst_free((void *)db->phone_states[i][j]);
        cst_free((void *)db->phone_states[i]);
    }
    cst_free((void *)db->phone_states);

    cst_free((void *)db->dynwin);

    /* Mixed-excitation filter coefficient rows */
    for (i=0; i<db->ME_num; i++)
        cst_free((void *)db->me_h[i]);
    cst_free((void *)db->me_h);

    cst_free((void *)db);
}
160 
161 /* */
/* Top-level clustergen synthesis for an utterance: build the HMM state
   structure, lay out parameter frames, predict the parameter values
   (optionally applying the SPAM F0 model) and resynthesize a waveform.
   Returns the same utterance with the wave attached. */
cst_utterance *cg_synth(cst_utterance *utt)
{
    cst_cg_db *db = val_cg_db(utt_feat_val(utt,"cg_db"));

    cg_make_hmmstates(utt);
    cg_make_params(utt);
    cg_predict_params(utt);
    if (db->spamf0)
        cst_spamf0(utt);
    cg_resynth(utt);

    return utt;
}
178 
/* Predict the duration (in seconds) of HMM state s.  Each duration
   model predicts a z-score; the average z-score is mapped back through
   the per-phone mean/stddev table of the first model. */
static float cg_state_duration(cst_item *s, cst_cg_db *cg_db)
{
    float zscore;
    const char *phone_name;
    int m, idx;

    /* Average the z-score predictions of all duration models */
    zscore = 0.0;
    for (m = 0; m < cg_db->num_dur_models; m++)
        zscore += val_float(cart_interpret(s, cg_db->dur_cart[m]));
    zscore /= (float)cg_db->num_dur_models;

    phone_name = item_feat_string(s, "name");

    /* Note we only use the dur stats from the first model, that is
       correct, but wouldn't be if the dur tree was trained on
       different data */
    for (idx = 0; cg_db->dur_stats[0][idx]; idx++)
        if (cst_streq(cg_db->dur_stats[0][idx]->phone, phone_name))
            break;
    if (cg_db->dur_stats[0][idx] == NULL)
        idx = 0;  /* unknown phone name: fall back to first entry */

    /* De-normalize: dur = z * stddev + mean */
    return (zscore * cg_db->dur_stats[0][idx]->stddev) +
        cg_db->dur_stats[0][idx]->mean;
}
210 
/* Build the HMM state structure below the Segment relation: for every
   segment, look up its state list in cg_db->phone_states and append one
   HMMstate item per state, linked via the "segstate" relation. */
static cst_utterance *cg_make_hmmstates(cst_utterance *utt)
{
    cst_cg_db *db;
    cst_relation *state_rel, *segstate_rel;
    cst_item *seg, *state_item, *seg_parent;
    const char *phone;
    int pi, si;

    db = val_cg_db(utt_feat_val(utt,"cg_db"));
    state_rel = utt_relation_create(utt,"HMMstate");
    segstate_rel = utt_relation_create(utt,"segstate");

    for (seg = utt_rel_head(utt,"Segment"); seg; seg = item_next(seg))
    {
        seg_parent = relation_append(segstate_rel, seg);
        phone = item_feat_string(seg,"name");

        /* Find this phone's entry; entry 0 of each row is the phone name */
        for (pi = 0; db->phone_states[pi]; pi++)
            if (cst_streq(phone, db->phone_states[pi][0]))
                break;
        if (db->phone_states[pi] == NULL)
            pi = 0;  /* unknown phoneme: use the first entry */

        /* Entries 1..n are the state names for this phone */
        for (si = 1; db->phone_states[pi][si]; si++)
        {
            state_item = relation_append(state_rel, NULL);
            item_add_daughter(seg_parent, state_item);
            item_set_string(state_item, "name", db->phone_states[pi][si]);
            item_set_int(state_item, "statepos", si);
        }
    }

    return utt;
}
244 
/* Create one "mcep" frame item per frame_advance of predicted state
   duration, linked to its HMM state through the "mcep_link" relation,
   and record the total frame count as "param_track_num_frames". */
static cst_utterance *cg_make_params(cst_utterance *utt)
{
    /* puts in the frame items */
    /* historically called "mcep" but can actually be any random vectors */
    cst_cg_db *cg_db;
    cst_relation *mcep, *mcep_link;
    cst_item *s, *mcep_parent, *mcep_frame;
    int num_frames;
    float start, end;
    float dur_stretch, tok_stretch, rdur;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    mcep = utt_relation_create(utt,"mcep");
    mcep_link = utt_relation_create(utt,"mcep_link");
    end = 0.0;
    num_frames = 0;
    /* Global rate control; 1.0 leaves durations unmodified */
    dur_stretch = get_param_float(utt->features,"duration_stretch", 1.0);

    for (s = utt_rel_head(utt,"HMMstate"); s; s=item_next(s))
    {
        start = end;
        /* Optional per-token stretch; 0 means the feature was absent */
        tok_stretch = ffeature_float(s,"R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_duration_stretch");
        if (tok_stretch == 0)
            tok_stretch = 1.0;
        rdur = tok_stretch*dur_stretch*cg_state_duration(s,cg_db);
        /* Guarantee duration to be at least one frame */
        if (rdur < cg_db->frame_advance)
            end = start + cg_db->frame_advance;
        else
            end = start + rdur;
        item_set_float(s,"end",end);
        mcep_parent = relation_append(mcep_link, s);
        /* Emit frames until this state's end time is covered */
        for ( ; (num_frames * cg_db->frame_advance) <= end; num_frames++ )
        {
            mcep_frame = relation_append(mcep,NULL);
            item_add_daughter(mcep_parent,mcep_frame);
            item_set_int(mcep_frame,"frame_number",num_frames);
            item_set(mcep_frame,"name",item_feat(mcep_parent,"name"));
        }
    }

    /* Copy duration up onto Segment relation */
    for (s = utt_rel_head(utt,"Segment"); s; s=item_next(s))
        item_set(s,"end",ffeature(s,"R:segstate.daughtern.end"));

    utt_set_feat_int(utt,"param_track_num_frames",num_frames);

    return utt;
}
294 
#if CG_OLD
/* Older voicing decision based purely on phonological features
   (kept for reference; superseded by the voiced_frame() below). */
static int voiced_frame(cst_item *m)
{
    const char *ph_vc;
    const char *ph_cvox;

    ph_vc = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_vc");
    ph_cvox = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_cvox");

    if (cst_streq("-",ph_vc) &&
        cst_streq("-",ph_cvox))
        return 0; /* unvoiced */
    else
        return 1; /* voiced */
}
#endif
311 
/* Decide whether mcep frame m is voiced: silence ("pau") is always
   unvoiced, phonologically voiced phones are always voiced, and
   everything else falls back to the frame's predicted "voicing" value. */
static int voiced_frame(cst_item *m)
{
    const char *vc;
    const char *seg_name;

    vc = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_vc");
    seg_name = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.name");

    if (cst_streq(seg_name,"pau"))
        return 0;  /* silence: unvoiced */
    if (cst_streq("+",vc))
        return 1;  /* phonologically voiced */
    /* Even though the range is 0-10, I *do* mean 0.5 */
    return (item_feat_float(m,"voicing") > 0.5) ? 1 : 0;
}
330 
/* Catmull-Rom spline interpolation between p1 and p2 at fraction p
   (0..1), with p0 and p3 as the outer control points.
   http://www.mvps.org/directx/articles/ */
static float catmull_rom_spline(float p,float p0,float p1,float p2,float p3)
{
    float linear_term;
    double quad_term, cubic_term, result;

    linear_term = p * (-p0 + p2);
    quad_term = (p * p) * (((2.0 * p0) - (5.0 * p1)) +
                           ((4.0 * p2) - p3));
    cubic_term = (p * p * p) * (-p0 +
                                ((3.0 * p1) - (3.0 * p2)) +
                                p3);

    result = 0.5 * ((2.0 * p1) + linear_term + quad_term + cubic_term);

    return (float)result;
}
351 
/* Replace the raw per-frame F0 (channel 0 of param_track) with a
   Catmull-Rom spline through per-syllable control points (syllable
   start, mid and end), using the previous and next syllables' mid F0
   values as the outer control points. */
static void cg_F0_interpolate_spline(cst_utterance *utt,
                                     cst_track *param_track)
{
    float start_f0, mid_f0, end_f0;
    int start_index, end_index, mid_index;
    int nsi, nei, nmi;  /* next syllable indices */
    float nmid_f0, pmid_f0;
    cst_item *syl;
    int i;
    float m;  /* slope: fraction of the segment per frame */

    start_f0 = mid_f0 = end_f0 = -1.0;

    for (syl=utt_rel_head(utt,"Syllable"); syl; syl=item_next(syl))
    {
        /* Frame range covered by this syllable */
        start_index = ffeature_int(syl,"R:SylStructure.daughter1.R:segstate.daughter1.R:mcep_link.daughter1.frame_number");
        end_index = ffeature_int(syl,"R:SylStructure.daughtern.R:segstate.daughtern.R:mcep_link.daughtern.frame_number");
        mid_index = (int)((start_index + end_index)/2.0);

        start_f0 = param_track->frames[start_index][0];
        if (end_f0 > 0.0)
            start_f0 = end_f0;  /* not first time through */
        if (mid_f0 < 0.0)
            pmid_f0 = start_f0;  /* first time through */
        else
            pmid_f0 = mid_f0;   /* previous syllable's mid F0 */
        mid_f0 =  param_track->frames[mid_index][0];
        if (item_next(syl)) /* not last syllable */
            end_f0 = (param_track->frames[end_index-1][0]+
                      param_track->frames[end_index][0])/2.0;
        else
            end_f0 = param_track->frames[end_index-1][0];
        nmid_f0=end_f0; /* in case there is no next syl */

        if (item_next(syl))
        {
            /* Mid F0 of the following syllable (outer control point) */
            nsi = ffeature_int(syl,"n.R:SylStructure.daughter1.R:segstate.daughter1.R:mcep_link.daughter1.frame_number");
            nei = ffeature_int(syl,"n.R:SylStructure.daughtern.R:segstate.daughtern.R:mcep_link.daughtern.frame_number");
            nmi = (int)((nsi + nei)/2.0);
            nmid_f0 = param_track->frames[nmi][0];
        }
        /* start to mid syl */
        /* NOTE(review): if mid_index == start_index, m becomes inf, but
           the loop body then never executes, so nothing is written --
           confirm this holds for all voices */
        m = 1.0 / (mid_index - start_index);
        for (i=0; ((start_index+i)<mid_index); i++)
            param_track->frames[start_index+i][0] =
                 catmull_rom_spline(i*m,pmid_f0,start_f0,mid_f0,end_f0);

        /* mid syl to end */
        m = 1.0 / (end_index - mid_index);
        for (i=0; ((mid_index+i)<end_index); i++)
            param_track->frames[mid_index+i][0] =
                catmull_rom_spline(i*m,start_f0,mid_f0,end_f0,nmid_f0);
    }

    return;
}
408 
#if 0
/* Disabled naive F0 smoother: replaces each voiced frame with the mean
   of itself and its voiced neighbours.  Kept for reference; the spline
   interpolation above is used instead. */
static void cg_smooth_F0_naive(cst_track *param_track)
{
    float l,s;  /* l: previous frame's F0; s: running sum */
    int i,c;    /* c: count of contributing frames */

    l = 0.0;
    for (i=0; i<param_track->num_frames-1; i++)
    {
        c = 0; s = 0;
        if (l > 0.0)
        {
            c++; s+=l;
        }
        if (param_track->frames[i+1][0] > 0.0)
        {
            c++; s+=param_track->frames[i+1][0];
        }
        l = param_track->frames[i][0];
        if (param_track->frames[i][0] > 0.0)
        {
            c++; s+=param_track->frames[i][0];
            param_track->frames[i][0] = s/c;
        }
    }

    return;
}
#endif
438 
/* Smooth F0, rescale it to the (possibly user-overridden) target
   mean/stddev, clamp it to 50..700 Hz, and zero the F0 of unvoiced
   frames. */
static void cg_smooth_F0(cst_utterance *utt,
                         cst_cg_db *cg_db,
                         cst_track *param_track)
{
    /* Smooth F0 and mark unvoice frames as 0.0 */
    cst_item *mcep;
    int i;
    float mean, stddev;

    /* cg_smooth_F0_naive(param_track); */

    cg_F0_interpolate_spline(utt,param_track);

    /* Target F0 stats: voice defaults, optionally overridden via
       utterance features; f0_shift scales the mean only */
    mean = get_param_float(utt->features,"int_f0_target_mean", cg_db->f0_mean);
    mean *= get_param_float(utt->features,"f0_shift", 1.0);
    stddev =
        get_param_float(utt->features,"int_f0_target_stddev", cg_db->f0_stddev);
#if 0
    /* Disabled debug dump of the F0 and voicing channels */
    FILE *ftt; int ii;
    ftt = cst_fopen("awb.f0",CST_OPEN_WRITE);
    printf("awb_debug saving F0\n");
    for (ii=0; ii<param_track->num_frames; ii++)
        cst_fprintf(ftt,"%f %f\n",param_track->frames[ii][0],
                    param_track->frames[ii][param_track->num_channels-2]);
    cst_fclose(ftt);
#endif

    /* assumes param_track frame i corresponds to the i'th mcep item --
       both were built from the same frame layout in cg_make_params */
    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        if (voiced_frame(mcep))
        {
            /* scale the F0 -- which normally wont change it at all */
            param_track->frames[i][0] =
                (((param_track->frames[i][0]-cg_db->f0_mean)/cg_db->f0_stddev)
                 *stddev)+mean;
            /* Some safety checks */
            if (param_track->frames[i][0] < 50)
                param_track->frames[i][0] = 50;
            if (param_track->frames[i][0] > 700)
                param_track->frames[i][0] = 700;
        }
        else /* Unvoice it */
            param_track->frames[i][0] = 0.0;
    }

    return;
}
486 
/* Unpack (dequantize) frame f of parameter model pm into v.  The
   storage layout depends on cg_db->model_shape.  Always returns 0. */
static int unpack_model_vector(cst_cg_db *cg_db,int pm,int f,float *v)
{
    /* This unpacks the potentially compressed/quantized data from the model */
    int i,j;

    if (cg_db->model_shape == CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS)
    {
        /* Each stored value packs two qtable indices: high byte
           (div 256) for the even channel, low byte (mod 256) for the
           odd channel */
        for (i=0; i<cg_db->num_channels[pm]/2; i++)
        {
            v[i*2] = cg_db->qtable[pm][i*2][cg_db->model_vectors[pm][f][i]/256];
            v[(i*2)+1] =
                cg_db->qtable[pm][(i*2)+1][cg_db->model_vectors[pm][f][i]%256];
        }
#if 0
        printf("awb_debug %d\n",f);
        for (i=0; i<cg_db->num_channels[pm]; i++)
            printf("%f ",v[i]);
        printf("\n");
        for (i=0; i<cg_db->num_channels[pm]/2; i++)
            printf("%d %d ",cg_db->model_vectors[pm][f][i]/256,
                   cg_db->model_vectors[pm][f][i]%256);
        printf("\n");
#endif
        return 0;
    }
    if (cg_db->model_shape == CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS_41)
    {
        /* Fixed 41-entry layout: F0 slot, 25 static mcep mean/stddev
           pairs, 12 delta values without stddevs, then mixed-excitation
           and voicing values at hard-coded offsets.
           NOTE(review): the index arithmetic below is layout-specific
           and only valid for dbs built with exactly this shape. */
        j=1; /* skip F0 mean/stddev */
        for (i=0; i<25; i++,j++)        /* mcep static mean/stddev */
        {
            v[j*2] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
            v[(j*2)+1] =
                cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]%256];
        }
        for (i=25; i<25+12; i+=1,j+=2)  /* mcep deltas no mean/stddev */
        {
            v[(j*2)+1] = cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]/256];
            v[(j*2)+3] =
                cg_db->qtable[pm][(j*2)+3][cg_db->model_vectors[pm][f][i]%256];
        }
        /* one delta, one me */
        v[(j*2)+1] = cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+2] = cg_db->qtable[pm][(j*2)+2][cg_db->model_vectors[pm][f][i]%256];
        i++; j+=2;
        /* one me, another me */
        v[(j*2)] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+2] = cg_db->qtable[pm][(j*2)+2][cg_db->model_vectors[pm][f][i]%256];
        i++; j+=2;
        /* one me, another me */
        v[(j*2)] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+2] = cg_db->qtable[pm][(j*2)+2][cg_db->model_vectors[pm][f][i]%256];
        i++; j+=2;
        /* one voicing and another v-stddef */
        v[(j*2)] = cg_db->qtable[pm][j*2][cg_db->model_vectors[pm][f][i]/256];
        v[(j*2)+1] = cg_db->qtable[pm][(j*2)+1][cg_db->model_vectors[pm][f][i]%256];
#if 0
        printf("awb_debug pm %d frame %d\n",pm,f);
        for (i=0; i<cg_db->num_channels[pm]; i++)
            printf("%f ",v[i]);
        printf("\n");
#endif
        return 0;
    }
    /* if (cg_db->model_shape == CST_CG_MODEL_SHAPE_BASE_MINRANGE) */
    else /* let's always do this second one in case model_shape isn't set */
    {
        /* 16-bit values scaled into [model_min, model_min+model_range] */
        for (i=0; i<cg_db->num_channels[pm]; i++)
        {
            v[i] = cg_db->model_min[i]+
                ((float)((cg_db->model_vectors[pm][f][i])/
                         65535.0)*cg_db->model_range[i]);
        }
        return 0;
    }
}
562 
/* Predict F0, spectral parameters and (optionally) mixed-excitation
   strengths for every mcep frame using the voice's CART trees, and
   attach the results as the "param_track" (and "str_track") utterance
   features.  Predictions are averaged over all F0/param models. */
static cst_utterance *cg_predict_params(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_item *mcep;
    const cst_cart *mcep_tree, *f0_tree;
    int i,j,f,p,o,pm;
    const char *mname;
    float *unpacked_vector;
    float f0_val, f0_bit;
    float local_gain, voicing;
    int fff;  /* channel stride: 1 keeps stddevs (for MLPG), 2 skips them */
    int extra_feats = 0;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = new_track();
    if (cg_db->do_mlpg) /* which should be the default */
        fff = 1;  /* copy details with stddevs */
    else
        fff = 2;  /* copy details without stddevs */

    extra_feats = 1;  /* voicing */
    if (cg_db->mixed_excitation)
    {
        extra_feats += 5;
        str_track = new_track();
        cst_track_resize(str_track,
                         utt_feat_int(utt,"param_track_num_frames"),
                         5);
    }

    cst_track_resize(param_track,
                     utt_feat_int(utt,"param_track_num_frames"),
                     (cg_db->num_channels[0]/fff)-
                       (2 * extra_feats));/* no voicing or str */
    unpacked_vector = cst_alloc(float,cg_db->num_channels[0]);
    f = 0;
    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        mname = item_feat_string(mcep,"name");
        /* Optional per-token gain; 0 means the feature was absent */
        local_gain = ffeature_float(mcep,"R:mcep_link.parent.R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_gain");
        if (local_gain == 0.0) local_gain = 1.0;
        /* Map the frame's (state) name to its tree index */
        for (p=0; cg_db->types[p]; p++)
            if (cst_streq(mname,cg_db->types[p]))
                break;
        if (cg_db->types[p] == NULL)
            p=0; /* if there isn't a matching tree, use the first one */

        /* Predict F0 */
        for (f0_val=pm=0; pm<cg_db->num_f0_models; pm++)
        {
            f0_tree = cg_db->f0_trees[pm][p];
            f0_bit = val_float(cart_interpret(mcep,f0_tree));
            f0_val += f0_bit;
        }
        param_track->frames[i][0] = f0_val/cg_db->num_f0_models;
        if (param_track->frames[i][0] < 50.0)
            param_track->frames[i][0] = 0.0;  /* implausibly low: unvoiced */
        /* what about stddev ? */

        /* We only have multiple models now, but the default is one model */
        /* Predict spectral coeffs */
        voicing = 0.0;
        for (pm=0; pm<cg_db->num_param_models; pm++)
        {
            mcep_tree = cg_db->param_trees[pm][p];
            f = val_int(cart_interpret(mcep,mcep_tree));
            /* If there is one model this will be fine, if there are */
            /* multiple models this will be the nth model */
            item_set_int(mcep,"clustergen_param_frame",f);

            /* Unpack the model[pm][f] vector */
            unpack_model_vector(cg_db,pm,f,unpacked_vector);

            /* Old code used to average in param[0] with F0 too (???) */

            /* Average each channel's value across the param models;
               channels 0/1 are F0 (handled above) */
            for (j=2; j<param_track->num_channels; j++)
            {
                if (pm == 0) param_track->frames[i][j] = 0.0;
                param_track->frames[i][j] += unpacked_vector[j*fff]/

                    (float)cg_db->num_param_models;
            }

            if (cg_db->mixed_excitation)
            {
                /* Strengths live after the spectral channels */
                o = j;
                for (j=0; j<5; j++)
                {
                    if (pm == 0) str_track->frames[i][j] = 0.0;
                    str_track->frames[i][j] +=
                        unpacked_vector[(o+(2*j))*fff] /
                        (float)cg_db->num_param_models;
                }
            }

            /* last coefficient is average voicing for cluster */
            /* NOTE(review): this is a running average that down-weights
               earlier models each iteration; looks intentional but odd
               -- confirm against training code */
            voicing /= (float)(pm+1);
            voicing += unpacked_vector[cg_db->num_channels[pm]-2] /
                (float)(pm+1);
        }
        item_set_float(mcep,"voicing",voicing);
        /* Apply local gain to c0 */
        param_track->frames[i][2] *= local_gain;

        param_track->times[i] = i * cg_db->frame_advance;
    }

    cst_free(unpacked_vector);
    cg_smooth_F0(utt,cg_db,param_track);

    utt_set_feat(utt,"param_track",track_val(param_track));
    if (cg_db->mixed_excitation)
        utt_set_feat(utt,"str_track",track_val(str_track));

    return utt;
}
681 
/* Resynthesize the waveform from the predicted parameter track using
   MLSA filtering (optionally MLPG-smoothed first) and attach it to the
   utterance.  Supports streaming via the "streaming_info" feature. */
static cst_utterance *cg_resynth(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_wave *w;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_track *smoothed_track;
    const cst_val *streaming_info_val;
    cst_audio_streaming_info *asi = NULL;
    int mlsa_speed_param = 0;

    streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
    if (streaming_info_val)
    {
        asi = val_audio_streaming_info(streaming_info_val);
        asi->utt = utt;
    }
    /* Values 5-15 might be reasonable to speed things up.  This number */
    /* is used to reduce the number of parameters used in the mceps      */
    /* e.g. value 10 will speed up from 21.0 faster than real time       */
    /* to 26.4 times faster than real time (for builtin rms) */
    mlsa_speed_param = get_param_int(utt->features,"mlsa_speed_param",0);

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = val_track(utt_feat_val(utt,"param_track"));
    if (cg_db->mixed_excitation)
        str_track = val_track(utt_feat_val(utt,"str_track"));

    if (cg_db->do_mlpg)
    {
        smoothed_track = mlpg(param_track, cg_db);
        w = mlsa_resynthesis(smoothed_track,str_track,cg_db,
                             asi,mlsa_speed_param);
        delete_track(smoothed_track);
    }
    else
        w=mlsa_resynthesis(param_track,str_track,cg_db,
                           asi,mlsa_speed_param);

    if (w == NULL)
    {
        /* Synthesis Failed, probably because it was interrupted */
        utt_set_feat_int(utt,"Interrupted",1);
        w = new_wave();
    }

#if 0
    /* NOTE(review): disabled block below is incomplete/corrupted;
       kept verbatim for reference */
    /* Apply local gain */
    for (i=0,tok=utt_rel_head(utt,"Token"); tok; i++,tok=item_next(tok))
    {
        if (item_feat_present(tok,"local_gain"))
            local_gain = item_feat_float(tokget_param_fffeature_float(tok,"R:mcep_link.parent.R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_gain");

    }
#endif

    utt_set_wave(utt,w);

    return utt;
}
742 
743 
744 
745