1 /*************************************************************************/
2 /* */
3 /* Language Technologies Institute */
4 /* Carnegie Mellon University */
5 /* Copyright (c) 2001 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author: Alan W Black (awb@cs.cmu.edu) */
34 /* Date: January 2001 */
35 /*************************************************************************/
36 /* */
37 /* General unit functions (diphones or clunit) */
38 /* */
39 /*************************************************************************/
40
41 #include "cst_math.h"
42 #include "cst_hrg.h"
43 #include "cst_utt_utils.h"
44 #include "cst_wave.h"
45 #include "cst_track.h"
46 #include "cst_units.h"
47 #include "cst_sigpr.h"
48
49 static int nearest_pm(cst_sts_list *sts_list,int start,int end,float u_index);
50
join_units(cst_utterance * utt)51 cst_utterance *join_units(cst_utterance *utt)
52 {
53 /* Make a waveform form the units */
54 const char *join_type;
55
56 join_type = get_param_string(utt->features,"join_type", "modified_lpc");
57
58 if (cst_streq(join_type,"none"))
59 return utt;
60 #if 0
61 else if (cst_streq(join_type,"windowed_join"))
62 join_units_windowed(utt);
63 #endif
64 else if (cst_streq(join_type,"simple_join"))
65 join_units_simple(utt);
66 else if (cst_streq(join_type,"modified_lpc"))
67 join_units_modified_lpc(utt);
68
69 return utt;
70 }
71
join_units_simple(cst_utterance * utt)72 cst_utterance *join_units_simple(cst_utterance *utt)
73 {
74 cst_wave *w = 0;
75 cst_lpcres *lpcres;
76 const char *resynth_type;
77 const cst_val *streaming_info_val;
78
79 resynth_type = get_param_string(utt->features,"resynth_type", "fixed");
80
81 asis_to_pm(utt);
82 concat_units(utt);
83
84 lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));
85
86 streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
87 if (streaming_info_val)
88 {
89 lpcres->asi = val_audio_streaming_info(streaming_info_val);
90 lpcres->asi->utt = utt;
91 }
92
93 if (cst_streq(resynth_type, "fixed"))
94 w = lpc_resynth_fixedpoint(lpcres);
95 else
96 {
97 cst_errmsg("unknown resynthesis type %s\n", resynth_type);
98 cst_error(); /* Should not happen */
99 }
100
101 utt_set_wave(utt,w);
102
103 return utt;
104 }
105
join_units_modified_lpc(cst_utterance * utt)106 cst_utterance *join_units_modified_lpc(cst_utterance *utt)
107 {
108 cst_wave *w = 0;
109 cst_lpcres *lpcres;
110 const char *resynth_type;
111 const cst_val *streaming_info_val;
112
113 resynth_type = get_param_string(utt->features,"resynth_type", "float");
114
115 f0_targets_to_pm(utt);
116 concat_units(utt);
117
118 lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));
119
120 streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
121 if (streaming_info_val)
122 {
123 lpcres->asi = val_audio_streaming_info(streaming_info_val);
124 lpcres->asi->utt = utt;
125 }
126
127 if (cst_streq(resynth_type, "float"))
128 w = lpc_resynth(lpcres);
129 else if (cst_streq(resynth_type, "fixed"))
130 {
131 w = lpc_resynth_fixedpoint(lpcres);
132 }
133 else
134 {
135 cst_errmsg("unknown resynthesis type %s\n", resynth_type);
136 cst_error(); /* Should not happen */
137 }
138
139 if (w == NULL)
140 {
141 /* Synthesis Failed, probably because it was interrupted */
142 utt_set_feat_int(utt,"Interrupted",1);
143 w = new_wave();
144 }
145
146 utt_set_wave(utt,w);
147
148 return utt;
149 }
150
asis_to_pm(cst_utterance * utt)151 cst_utterance *asis_to_pm(cst_utterance *utt)
152 {
153 /* Copy the PM structure from the units unchanged */
154 cst_item *u;
155 cst_lpcres *target_lpcres;
156 int unit_start, unit_end;
157 int utt_pms, utt_size, i;
158 cst_sts_list *sts_list;
159
160 sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
161 target_lpcres = new_lpcres();
162
163 /* Pass one to find the size */
164 utt_pms = utt_size = 0;
165 for (u=relation_head(utt_relation(utt,"Unit"));
166 u;
167 u=item_next(u))
168 {
169 unit_start = item_feat_int(u,"unit_start");
170 unit_end = item_feat_int(u,"unit_end");
171 utt_size += get_unit_size(sts_list,unit_start,unit_end);
172 utt_pms += unit_end - unit_start;
173 item_set_int(u,"target_end",utt_size);
174 }
175 lpcres_resize_frames(target_lpcres,utt_pms);
176
177 /* Pass two to fill in the values */
178 utt_pms = utt_size = 0;
179 for (u=relation_head(utt_relation(utt,"Unit"));
180 u;
181 u=item_next(u))
182 {
183 unit_start = item_feat_int(u,"unit_start");
184 unit_end = item_feat_int(u,"unit_end");
185 for (i=unit_start; i<unit_end; i++,utt_pms++)
186 {
187 utt_size += get_frame_size(sts_list, i);
188 target_lpcres->times[utt_pms] = utt_size;
189 }
190 }
191 utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
192 return utt;
193 }
194
f0_targets_to_pm(cst_utterance * utt)195 cst_utterance *f0_targets_to_pm(cst_utterance *utt)
196 {
197 cst_item *t;
198 float pos,lpos,f0,lf0,m;
199 double time;
200 int pm;
201 cst_sts_list *sts_list;
202 cst_lpcres *target_lpcres;
203
204 sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
205 lpos = 0;
206 lf0 = 120; /* hmm */
207 pm = 0;
208 time = 0;
209 /* First pass to count how many pms will be required */
210 for (t=relation_head(utt_relation(utt,"Target"));
211 t;
212 t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
213 {
214 pos = item_feat_float(t,"pos");
215 f0 = item_feat_float(t,"f0");
216 if (time == pos) continue;
217 m = (f0-lf0)/(pos-lpos);
218 for ( ; time < pos; pm++)
219 {
220 time += 1/(lf0 + ((time-lpos)*m));
221 }
222 }
223 target_lpcres = new_lpcres();
224 lpcres_resize_frames(target_lpcres,pm);
225
226 lpos = 0;
227 lf0 = 120;
228 pm = 0;
229 time = 0;
230 /* Second pass puts the values in */
231 for (t=relation_head(utt_relation(utt,"Target"));
232 t;
233 t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
234 {
235 pos = item_feat_float(t,"pos");
236 f0 = item_feat_float(t,"f0");
237 if (time == pos) continue;
238 m = (f0-lf0)/(pos-lpos);
239 for ( ; time < pos; pm++)
240 {
241 time += 1/(lf0 + ((time-lpos)*m));
242 target_lpcres->times[pm] = sts_list->sample_rate * time;
243 }
244 }
245 utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
246 return utt;
247 }
248
concat_units(cst_utterance * utt)249 cst_utterance *concat_units(cst_utterance *utt)
250 {
251 cst_lpcres *target_lpcres;
252 cst_item *u;
253 int pm_i;
254 int unit_size, unit_start, unit_end;
255 int rpos, nearest_u_pm;
256 int target_end, target_start;
257 float m, u_index;
258 cst_sts_list *sts_list;
259 const char *residual_type;
260
261 sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
262 if (sts_list->codec == NULL)
263 residual_type = "ulaw";
264 else
265 residual_type = sts_list->codec;
266 target_lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));
267
268 target_lpcres->lpc_min = sts_list->coeff_min;
269 target_lpcres->lpc_range = sts_list->coeff_range;
270 target_lpcres->num_channels = sts_list->num_channels;
271 target_lpcres->sample_rate = sts_list->sample_rate;
272 lpcres_resize_samples(target_lpcres,
273 target_lpcres->times[target_lpcres->num_frames-1]);
274 if (utt_feat_val(utt,"delayed_decoding"))
275 {
276 target_lpcres->delayed_decoding = 1;
277 target_lpcres->packed_residuals =
278 cst_alloc(const unsigned char *,target_lpcres->num_frames);
279 }
280
281 target_start = 0.0; rpos = 0; pm_i = 0; u_index = 0;
282 for (u=relation_head(utt_relation(utt,"Unit")); u; u=item_next(u))
283 {
284 unit_start = item_feat_int(u,"unit_start");
285 unit_end = item_feat_int(u,"unit_end");
286 unit_size = get_unit_size(sts_list,unit_start,unit_end);
287 target_end = item_feat_int(u,"target_end");
288
289 u_index = 0;
290 m = (float)unit_size/(float)(target_end-target_start);
291 /* printf("unit_size %d start %d end %d tstart %d tend %d m %f\n",
292 unit_size, unit_start, unit_end, target_start, target_end, m); */
293 for ( /* pm_start=pm_i */ ;
294 (pm_i < target_lpcres->num_frames) &&
295 (target_lpcres->times[pm_i] <= target_end);
296 pm_i++)
297 {
298 nearest_u_pm = nearest_pm(sts_list,unit_start,unit_end,u_index);
299 /* Get LPC coefs (pointer) */
300 target_lpcres->frames[pm_i] = get_sts_frame(sts_list, nearest_u_pm);
301 /* Get residual (copy) */
302 target_lpcres->sizes[pm_i] =
303 target_lpcres->times[pm_i] -
304 (pm_i > 0 ? target_lpcres->times[pm_i-1] : 0);
305 if (cst_streq(residual_type,"pulse"))
306 add_residual_pulse(target_lpcres->sizes[pm_i],
307 &target_lpcres->residual[rpos],
308 get_frame_size(sts_list, nearest_u_pm),
309 get_sts_residual(sts_list, nearest_u_pm));
310 else if (cst_streq(residual_type,"g721"))
311 add_residual_g721(target_lpcres->sizes[pm_i],
312 &target_lpcres->residual[rpos],
313 get_frame_size(sts_list, nearest_u_pm),
314 get_sts_residual(sts_list, nearest_u_pm));
315 else if (cst_streq(residual_type,"g721vuv"))
316 {
317 if (target_lpcres->delayed_decoding)
318 {
319 target_lpcres->packed_residuals[pm_i] =
320 get_sts_residual(sts_list, nearest_u_pm);
321 }
322 else
323 {
324 add_residual_g721vuv(target_lpcres->sizes[pm_i],
325 &target_lpcres->residual[rpos],
326 get_frame_size(sts_list, nearest_u_pm),
327 get_sts_residual(sts_list, nearest_u_pm));
328 }
329 }
330 else if (cst_streq(residual_type,"vuv"))
331 add_residual_vuv(target_lpcres->sizes[pm_i],
332 &target_lpcres->residual[rpos],
333 get_frame_size(sts_list, nearest_u_pm),
334 get_sts_residual(sts_list, nearest_u_pm));
335 /* But this requires particular layout of residuals which
336 probably isn't true */
337 /*
338 if (cst_streq(residual_type,"windowed"))
339 add_residual_windowed(target_lpcres->sizes[pm_i],
340 &target_lpcres->residual[rpos],
341 get_frame_size(sts_list, nearest_u_pm),
342 get_sts_residual(sts_list, nearest_u_pm));
343 */
344 else /* default is "ulaw" */
345 add_residual(target_lpcres->sizes[pm_i],
346 &target_lpcres->residual[rpos],
347 get_frame_size(sts_list, nearest_u_pm),
348 get_sts_residual(sts_list, nearest_u_pm));
349 rpos+=target_lpcres->sizes[pm_i];
350 u_index += (float)target_lpcres->sizes[pm_i]*m;
351 }
352 target_start = target_end;
353 }
354 target_lpcres->num_frames = pm_i;
355 return utt;
356 }
357
nearest_pm(cst_sts_list * sts_list,int start,int end,float u_index)358 static int nearest_pm(cst_sts_list *sts_list, int start,int end,float u_index)
359 {
360 /* First the pm in unit_entry that is closest to u_index */
361 int i, i_size, n_size;
362 i_size = 0;
363
364 for (i=start; i < end; i++)
365 {
366 n_size = i_size + get_frame_size(sts_list, i);
367 if (fabs((double)(u_index-(float)i_size)) <
368 fabs((double)(u_index-(float)n_size)))
369 return i;
370 i_size = n_size;
371 }
372
373 return end-1;
374 }
375
376 #if 0
377 void add_residual_windowed(int targ_size,
378 unsigned char *targ_residual,
379 int unit_size,
380 const unsigned char *unit_residual)
381 {
382 /* Note this doesn't work unless the unit_residuals and consecutive */
383 #define DI_PI 3.14159265358979323846
384 float *window, *unit, *residual;
385 int i,j,k, offset, win_size;
386
387 win_size = (targ_size*2)+1;
388 window = cst_alloc(float,win_size);
389 window[targ_size+1] = 1.0;
390 k = DI_PI / (win_size - 1);
391 for (i=0,j=win_size-1; i < targ_size+1; i++,j--)
392 window[j] = window[i] = 0.54 - (0.46 * cos(k * i));
393
394 residual = cst_alloc(float,win_size);
395 for (i=0; i<win_size; i++)
396 residual[i] = cst_ulaw_to_short(targ_residual[i]);
397
398 unit = cst_alloc(float,(unit_size*2)+1);
399 for (i=0; i<(unit_size*2)+1; i++)
400 unit[i] = cst_ulaw_to_short(unit_residual[i]);
401
402 if (targ_size < unit_size)
403 for (i=0; i < win_size; i++)
404 residual[i] += window[i] * unit[i+(unit_size-targ_size)/2];
405 else
406 {
407 offset = (targ_size-unit_size)/2;
408 for (i=offset; i < win_size-offset; i++)
409 residual[i] += window[i] * unit[i-offset];
410 }
411
412 for (i=0; i < win_size; i++)
413 targ_residual[i] = cst_short_to_ulaw((short)residual[i]);
414
415 cst_free(window);
416 cst_free(residual);
417 cst_free(unit);
418
419 }
420 #endif
421
add_residual(int targ_size,unsigned char * targ_residual,int unit_size,const unsigned char * unit_residual)422 void add_residual(int targ_size, unsigned char *targ_residual,
423 int unit_size, const unsigned char *unit_residual)
424 {
425 /* float pow_factor;
426 int i; */
427
428 if (unit_size < targ_size)
429 memmove(&targ_residual[((targ_size-unit_size)/2)],
430 &unit_residual[0],
431 unit_size*sizeof(unsigned char));
432 else
433 {
434 memmove(&targ_residual[0],
435 &unit_residual[((unit_size-targ_size)/2)],
436 targ_size*sizeof(unsigned char));
437 }
438 #if 0
439 if (unit_size < targ_size)
440 memmove(&targ_residual[0],
441 &unit_residual[0],
442 unit_size*sizeof(unsigned char));
443 else
444 {
445 memmove(&targ_residual[0],
446 &unit_residual[0],
447 targ_size*sizeof(unsigned char));
448 }
449 #endif
450 }
451
add_residual_g721(int targ_size,unsigned char * targ_residual,int uunit_size,const unsigned char * unit_residual)452 void add_residual_g721(int targ_size, unsigned char *targ_residual,
453 int uunit_size, const unsigned char *unit_residual)
454 {
455 /* Residual is encoded with g721 */
456 unsigned char *unit_residual_unpacked;
457 int unit_size;
458
459 unit_residual_unpacked =
460 cst_g721_decode(&unit_size, (uunit_size+CST_G721_LEADIN+1)/2, unit_residual);
461
462 if (uunit_size < targ_size)
463 memmove(&targ_residual[((targ_size-uunit_size)/2)],
464 &unit_residual_unpacked[CST_G721_LEADIN],
465 uunit_size*sizeof(unsigned char));
466 else
467 {
468 memmove(&targ_residual[0],
469 &unit_residual_unpacked[CST_G721_LEADIN+((uunit_size-targ_size)/2)],
470 targ_size*sizeof(unsigned char));
471 }
472
473 cst_free(unit_residual_unpacked);
474 }
475
plus_or_minus_one()476 static double plus_or_minus_one()
477 {
478 /* Randomly return 1 or -1 */
479 /* not sure rand() is portable */
480 if (rand() > RAND_MAX/2.0)
481 return 1.0;
482 else
483 return -1.0;
484 }
485
rand_zero_to_one()486 static double rand_zero_to_one()
487 {
488 /* Return number between 0.0 and 1.0 */
489 return rand()/(float)RAND_MAX;
490 }
491
add_residual_g721vuv(int targ_size,unsigned char * targ_residual,int uunit_size,const unsigned char * unit_residual)492 void add_residual_g721vuv(int targ_size, unsigned char *targ_residual,
493 int uunit_size, const unsigned char *unit_residual)
494 {
495 /* Residual is encoded with g721 */
496 unsigned char *unit_residual_unpacked;
497 int p, j;
498 float m, q;
499 int unit_size;
500 int offset;
501
502 if (unit_residual[0] == 0)
503 {
504 unit_size = uunit_size;
505 unit_residual_unpacked = cst_alloc(unsigned char,unit_size);
506 p = unit_residual[4]; p = p << 8;
507 p += unit_residual[3]; p = p << 8;
508 p += unit_residual[2]; p = p << 8;
509 p += unit_residual[1];
510 m = ((float)p);
511 for (j=0; j<unit_size; j++)
512 {
513 q = m*2*rand_zero_to_one()*plus_or_minus_one();
514 unit_residual_unpacked[j] = cst_short_to_ulaw((short)q);
515 }
516 offset = 0;
517 }
518 else
519 {
520 unit_residual_unpacked =
521 cst_g721_decode(&unit_size, (uunit_size+CST_G721_LEADIN+1)/2, unit_residual);
522 offset = CST_G721_LEADIN;
523 }
524
525 if (uunit_size < targ_size)
526 memmove(&targ_residual[((targ_size-uunit_size)/2)],
527 &unit_residual_unpacked[offset],
528 uunit_size*sizeof(unsigned char));
529 else
530 {
531 memmove(&targ_residual[0],
532 &unit_residual_unpacked[offset+((uunit_size-targ_size)/2)],
533 targ_size*sizeof(unsigned char));
534 }
535
536 cst_free(unit_residual_unpacked);
537 }
538
add_residual_vuv(int targ_size,unsigned char * targ_residual,int uunit_size,const unsigned char * unit_residual)539 void add_residual_vuv(int targ_size, unsigned char *targ_residual,
540 int uunit_size, const unsigned char *unit_residual)
541 {
542 /* Residual is encoded with vuv */
543 unsigned char *unit_residual_unpacked;
544 int p, j;
545 float m, q;
546 int unit_size;
547
548 if (unit_residual[0] == 0)
549 {
550 unit_size = uunit_size;
551 unit_residual_unpacked = cst_alloc(unsigned char,unit_size);
552 p = unit_residual[4]; p = p << 8;
553 p += unit_residual[3]; p = p << 8;
554 p += unit_residual[2]; p = p << 8;
555 p += unit_residual[1];
556 m = ((float)p);
557 for (j=0; j<unit_size; j++)
558 {
559 q = m*2*rand_zero_to_one()*plus_or_minus_one();
560 unit_residual_unpacked[j] = cst_short_to_ulaw((short)q);
561 }
562 }
563 else
564 {
565 /* Put in to the unpacked -- with no unpacking */
566 /* The cast is because unit_residual is const, and can't be deleted */
567 unit_residual_unpacked = (unsigned char *)(void *)unit_residual;
568 }
569
570 if (uunit_size < targ_size)
571 memmove(&targ_residual[((targ_size-uunit_size)/2)],
572 &unit_residual_unpacked[0],
573 uunit_size*sizeof(unsigned char));
574 else
575 {
576 memmove(&targ_residual[0],
577 &unit_residual_unpacked[((uunit_size-targ_size)/2)],
578 targ_size*sizeof(unsigned char));
579 }
580
581 if (unit_residual[0] == 0)
582 cst_free(unit_residual_unpacked);
583 }
584
add_residual_pulse(int targ_size,unsigned char * targ_residual,int unit_size,const unsigned char * unit_residual)585 void add_residual_pulse(int targ_size, unsigned char *targ_residual,
586 int unit_size, const unsigned char *unit_residual)
587 {
588 int p,i,m;
589 /* Unit residual isn't a pointer its a number, the power for the
590 the sts, yes this is hackily casting the address to a number */
591
592 /* Need voiced and unvoiced model */
593 p = (int)unit_residual; /* I know the compiler will complain about this */
594
595 if (p > 7000) /* voiced */
596 {
597 i = ((targ_size-unit_size)/2);
598 targ_residual[i-2] = cst_short_to_ulaw((short)(p/4));
599 targ_residual[i] = cst_short_to_ulaw((short)(p/2));
600 targ_residual[i+2] = cst_short_to_ulaw((short)(p/4));
601 }
602 else /* unvoiced */
603 {
604 m = p / targ_size;
605 for (i=0; i<targ_size; i++)
606 targ_residual[i] =
607 cst_short_to_ulaw((short)(m*plus_or_minus_one()));
608 }
609
610 #if 0
611 if (unit_size < targ_size)
612 targ_residual[((targ_size-unit_size)/2)]
613 = cst_short_to_ulaw((short)(int)unit_residual);
614 else
615 targ_residual[((unit_size-targ_size)/2)]
616 = cst_short_to_ulaw((short)(int)unit_residual);
617 #endif
618 }
619
620