1 /* ----------------------------------------------------------------- */
2 /*           The HMM-Based Speech Synthesis Engine "hts_engine API"  */
3 /*           developed by HTS Working Group                          */
4 /*           http://hts-engine.sourceforge.net/                      */
5 /* ----------------------------------------------------------------- */
6 /*                                                                   */
7 /*  Copyright (c) 2001-2015  Nagoya Institute of Technology          */
8 /*                           Department of Computer Science          */
9 /*                                                                   */
10 /*                2001-2008  Tokyo Institute of Technology           */
11 /*                           Interdisciplinary Graduate School of    */
12 /*                           Science and Engineering                 */
13 /*                                                                   */
14 /* All rights reserved.                                              */
15 /*                                                                   */
16 /* Redistribution and use in source and binary forms, with or        */
17 /* without modification, are permitted provided that the following   */
18 /* conditions are met:                                               */
19 /*                                                                   */
20 /* - Redistributions of source code must retain the above copyright  */
21 /*   notice, this list of conditions and the following disclaimer.   */
22 /* - Redistributions in binary form must reproduce the above         */
23 /*   copyright notice, this list of conditions and the following     */
24 /*   disclaimer in the documentation and/or other materials provided */
25 /*   with the distribution.                                          */
26 /* - Neither the name of the HTS working group nor the names of its  */
27 /*   contributors may be used to endorse or promote products derived */
28 /*   from this software without specific prior written permission.   */
29 /*                                                                   */
30 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND            */
31 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
32 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
33 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
34 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
35 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,          */
36 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED   */
37 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     */
38 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
39 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   */
40 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY    */
41 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
42 /* POSSIBILITY OF SUCH DAMAGE.                                       */
43 /* ----------------------------------------------------------------- */
44 
45 #ifndef HTS_SSTREAM_C
46 #define HTS_SSTREAM_C
47 
48 #ifdef __cplusplus
49 #define HTS_SSTREAM_C_START extern "C" {
50 #define HTS_SSTREAM_C_END   }
51 #else
52 #define HTS_SSTREAM_C_START
53 #define HTS_SSTREAM_C_END
54 #endif                          /* __CPLUSPLUS */
55 
56 HTS_SSTREAM_C_START;
57 
58 #include <stdlib.h>
59 #include <math.h>
60 
61 /* hts_engine libraries */
62 #include "HTS_hidden.h"
63 
/* HTS_set_default_duration: set default duration from state duration probability distribution */
/* Each duration is the mean rounded to the nearest integer, never less than 1. */
/* Returns the sum of all assigned durations. */
static double HTS_set_default_duration(size_t * duration, double *mean, double *vari, size_t size)
{
   size_t total = 0;
   size_t idx = 0;

   (void) vari;                 /* the variance does not take part in the default estimate */

   while (idx < size) {
      const double rounded = mean[idx] + 0.5;

      duration[idx] = (rounded < 1.0) ? 1 : (size_t) rounded;
      total += duration[idx];
      idx++;
   }

   return (double) total;
}
82 
/* HTS_set_duration_by_speed: set duration from state duration mean divided by speed */
/* Each duration is mean / speed rounded to the nearest integer, never less than 1. */
/* Returns the sum of all assigned durations. */
static double HTS_set_duration_by_speed(size_t * duration, double *mean, double *vari, size_t size, double speed)
{
   size_t total = 0;
   size_t idx;

   (void) vari;                 /* the variance does not take part in this estimate */

   for (idx = 0; idx != size; ++idx) {
      const double scaled = mean[idx] / speed + 0.5;

      duration[idx] = (scaled < 1.0) ? 1 : (size_t) scaled;
      total += duration[idx];
   }

   return (double) total;
}
100 
/* HTS_set_specified_duration: set duration from state duration probability distribution and specified frame length */
/*
 * duration     : output, one duration (>= 1) per state
 * mean, vari   : duration mean and variance of each state (size entries each)
 * size         : number of states
 * frame_length : requested total length; rounded to the nearest integer, minimum 1
 *
 * Returns the total length actually assigned: size when the request is not
 * longer than one frame per state, the rounded frame_length otherwise, and
 * 0 when there are no states at all.
 */
static double HTS_set_specified_duration(size_t * duration, double *mean, double *vari, size_t size, double frame_length)
{
   size_t i;
   size_t best;                 /* index of the state selected for adjustment */
   int found;                   /* non-zero once best/best_score hold a candidate */
   double score, best_score;
   double total_mean, total_vari;
   double rho;
   size_t sum = 0;
   size_t target_length;

   /* nothing to distribute; also keeps the adjustment loop from indexing an empty array */
   if (size == 0)
      return 0.0;

   /* get the target frame length */
   if (frame_length + 0.5 < 1.0)
      target_length = 1;
   else
      target_length = (size_t) (frame_length + 0.5);

   /* check the specified duration: every state needs at least one frame */
   if (target_length <= size) {
      if (target_length < size)
         HTS_error(-1, "HTS_set_specified_duration: Specified frame length is too short.\n");
      for (i = 0; i < size; i++)
         duration[i] = 1;
      return (double) size;
   }

   /* RHO calculation */
   total_mean = 0.0;
   total_vari = 0.0;
   for (i = 0; i < size; i++) {
      total_mean += mean[i];
      total_vari += vari[i];
   }
   /* with a zero total variance the quotient would be inf/NaN and the */
   /* float-to-integer conversions below would be undefined; use no shift then */
   if (total_vari != 0.0)
      rho = ((double) target_length - total_mean) / total_vari;
   else
      rho = 0.0;

   /* first estimation */
   for (i = 0; i < size; i++) {
      score = mean[i] + rho * vari[i] + 0.5;
      if (score < 1.0)
         duration[i] = 1;
      else
         duration[i] = (size_t) score;
      sum += duration[i];
   }

   /* loop estimation: move one frame at a time until the total matches */
   while (target_length != sum) {
      /* search flexible state and modify its duration */
      found = 0;
      best = 0;
      best_score = 0.0;
      if (target_length > sum) {
         /* lengthen the state whose incremented duration stays closest to rho */
         for (i = 0; i < size; i++) {
            score = fabs(rho - ((double) duration[i] + 1 - mean[i]) / vari[i]);
            if (!found || best_score > score) {
               found = 1;
               best = i;
               best_score = score;
            }
         }
         sum++;
         duration[best]++;
      } else {
         /* shorten a state that is longer than one frame; one always exists */
         /* here because sum > target_length > size */
         for (i = 0; i < size; i++) {
            if (duration[i] > 1) {
               score = fabs(rho - ((double) duration[i] - 1 - mean[i]) / vari[i]);
               if (!found || best_score > score) {
                  found = 1;
                  best = i;
                  best_score = score;
               }
            }
         }
         if (!found)
            break;              /* defensive: never decrement a one-frame state */
         sum--;
         duration[best]--;
      }
   }

   return (double) target_length;
}
177 
178 /* HTS_SStreamSet_initialize: initialize state stream set */
179 void HTS_SStreamSet_initialize(HTS_SStreamSet * sss)
180 {
181    sss->nstream = 0;
182    sss->nstate = 0;
183    sss->sstream = NULL;
184    sss->duration = NULL;
185    sss->total_state = 0;
186    sss->total_frame = 0;
187 }
188 
189 /* HTS_SStreamSet_create: parse label and determine state duration */
190 HTS_Boolean HTS_SStreamSet_create(HTS_SStreamSet * sss, HTS_ModelSet * ms, HTS_Label * label, HTS_Boolean phoneme_alignment_flag, double speed, double *duration_iw, double **parameter_iw, double **gv_iw)
191 {
192    size_t i, j, k;
193    double temp;
194    int shift;
195    size_t state;
196    HTS_SStream *sst;
197    double *duration_mean, *duration_vari;
198    double frame_length;
199    size_t next_time;
200    size_t next_state;
201    double label_dur_mod;
202 
203    if (HTS_Label_get_size(label) == 0)
204       return FALSE;
205 
206    /* check interpolation weights */
207    for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++)
208       temp += duration_iw[i];
209    if (temp == 0.0) {
210       return FALSE;
211    } else if (temp != 1.0) {
212       for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
213          if (duration_iw[i] != 0.0)
214             duration_iw[i] /= temp;
215    }
216 
217    for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) {
218       for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
219          temp += parameter_iw[j][i];
220       if (temp == 0.0) {
221          return FALSE;
222       } else if (temp != 1.0) {
223          for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
224             if (parameter_iw[j][i] != 0.0)
225                parameter_iw[j][i] /= temp;
226       }
227       if (HTS_ModelSet_use_gv(ms, i)) {
228          for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
229             temp += gv_iw[j][i];
230          if (temp == 0.0)
231             return FALSE;
232          else if (temp != 1.0)
233             for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
234                if (gv_iw[j][i] != 0.0)
235                   gv_iw[j][i] /= temp;
236       }
237    }
238 
239    /* initialize state sequence */
240    sss->nstate = HTS_ModelSet_get_nstate(ms);
241    sss->nstream = HTS_ModelSet_get_nstream(ms);
242    sss->total_frame = 0;
243    sss->total_state = HTS_Label_get_size(label) * sss->nstate;
244    sss->duration = (size_t *) HTS_calloc(sss->total_state, sizeof(size_t));
245    sss->sstream = (HTS_SStream *) HTS_calloc(sss->nstream, sizeof(HTS_SStream));
246    for (i = 0; i < sss->nstream; i++) {
247       sst = &sss->sstream[i];
248       sst->vector_length = HTS_ModelSet_get_vector_length(ms, i);
249       sst->mean = (double **) HTS_calloc(sss->total_state, sizeof(double *));
250       sst->vari = (double **) HTS_calloc(sss->total_state, sizeof(double *));
251       if (HTS_ModelSet_is_msd(ms, i))
252          sst->msd = (double *) HTS_calloc(sss->total_state, sizeof(double));
253       else
254          sst->msd = NULL;
255       for (j = 0; j < sss->total_state; j++) {
256          sst->mean[j] = (double *) HTS_calloc(sst->vector_length * HTS_ModelSet_get_window_size(ms, i), sizeof(double));
257          sst->vari[j] = (double *) HTS_calloc(sst->vector_length * HTS_ModelSet_get_window_size(ms, i), sizeof(double));
258       }
259       if (HTS_ModelSet_use_gv(ms, i)) {
260          sst->gv_switch = (HTS_Boolean *) HTS_calloc(sss->total_state, sizeof(HTS_Boolean));
261          for (j = 0; j < sss->total_state; j++)
262             sst->gv_switch[j] = TRUE;
263       } else {
264          sst->gv_switch = NULL;
265       }
266    }
267 
268    /* determine state duration */
269    duration_mean = (double *) HTS_calloc(sss->total_state, sizeof(double));
270    duration_vari = (double *) HTS_calloc(sss->total_state, sizeof(double));
271    for (i = 0; i < HTS_Label_get_size(label); i++)
272      {
273        HTS_ModelSet_get_duration(ms, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i), duration_iw, &duration_mean[i * sss->nstate], &duration_vari[i * sss->nstate]);
274        label_dur_mod=HTS_Label_get_dur_mod(label, i);
275        for(j=0; j < sss->nstate; ++j)
276          duration_mean[i*sss->nstate+j]*=label_dur_mod;
277      }
278    if (phoneme_alignment_flag == TRUE) {
279       /* use duration set by user */
280       next_time = 0;
281       next_state = 0;
282       state = 0;
283       for (i = 0; i < HTS_Label_get_size(label); i++) {
284          temp = HTS_Label_get_end_frame(label, i);
285          if (temp >= 0) {
286             next_time += (size_t) HTS_set_specified_duration(&sss->duration[next_state], &duration_mean[next_state], &duration_vari[next_state], state + sss->nstate - next_state, temp - next_time);
287             next_state = state + sss->nstate;
288          } else if (i + 1 == HTS_Label_get_size(label)) {
289             HTS_error(-1, "HTS_SStreamSet_create: The time of final label is not specified.\n");
290             HTS_set_default_duration(&sss->duration[next_state], &duration_mean[next_state], &duration_vari[next_state], state + sss->nstate - next_state);
291          }
292          state += sss->nstate;
293       }
294    } else {
295       /* determine frame length */
296       if (speed != 1.0) {
297          /* temp = 0.0; */
298          /* for (i = 0; i < sss->total_state; i++) { */
299          /*    temp += duration_mean[i]; */
300          /* } */
301          /* frame_length = temp / speed; */
302          /* HTS_set_specified_duration(sss->duration, duration_mean, duration_vari, sss->total_state, frame_length); */
303          HTS_set_duration_by_speed(sss->duration, duration_mean, duration_vari, sss->total_state, speed);
304       } else {
305          HTS_set_default_duration(sss->duration, duration_mean, duration_vari, sss->total_state);
306       }
307    }
308    HTS_free(duration_mean);
309    HTS_free(duration_vari);
310 
311    for(i=0;i<(sss->nstate-1);++i)
312      {
313        sss->duration[i]=1;
314 }
315    if(sss->duration[sss->nstate-1]>1)
316      sss->duration[sss->nstate-1]=1;
317 
318    /* get parameter */
319    for (i = 0, state = 0; i < HTS_Label_get_size(label); i++) {
320       for (j = 2; j <= sss->nstate + 1; j++) {
321          sss->total_frame += sss->duration[state];
322          for (k = 0; k < sss->nstream; k++) {
323             sst = &sss->sstream[k];
324             if (sst->msd)
325                HTS_ModelSet_get_parameter(ms, k, j, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i), (const double *const *) parameter_iw, sst->mean[state], sst->vari[state], &sst->msd[state]);
326             else
327                HTS_ModelSet_get_parameter(ms, k, j, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i), (const double *const *) parameter_iw, sst->mean[state], sst->vari[state], NULL);
328          }
329          state++;
330       }
331    }
332 
333    /* copy dynamic window */
334    for (i = 0; i < sss->nstream; i++) {
335       sst = &sss->sstream[i];
336       sst->win_size = HTS_ModelSet_get_window_size(ms, i);
337       sst->win_max_width = HTS_ModelSet_get_window_max_width(ms, i);
338       sst->win_l_width = (int *) HTS_calloc(sst->win_size, sizeof(int));
339       sst->win_r_width = (int *) HTS_calloc(sst->win_size, sizeof(int));
340       sst->win_coefficient = (double **) HTS_calloc(sst->win_size, sizeof(double));
341       for (j = 0; j < sst->win_size; j++) {
342          sst->win_l_width[j] = HTS_ModelSet_get_window_left_width(ms, i, j);
343          sst->win_r_width[j] = HTS_ModelSet_get_window_right_width(ms, i, j);
344          if (sst->win_l_width[j] + sst->win_r_width[j] == 0)
345             sst->win_coefficient[j] = (double *) HTS_calloc(-2 * sst->win_l_width[j] + 1, sizeof(double));
346          else
347             sst->win_coefficient[j] = (double *) HTS_calloc(-2 * sst->win_l_width[j], sizeof(double));
348          sst->win_coefficient[j] -= sst->win_l_width[j];
349          for (shift = sst->win_l_width[j]; shift <= sst->win_r_width[j]; shift++)
350             sst->win_coefficient[j][shift] = HTS_ModelSet_get_window_coefficient(ms, i, j, shift);
351       }
352    }
353 
354    /* determine GV */
355    for (i = 0; i < sss->nstream; i++) {
356       sst = &sss->sstream[i];
357       if (HTS_ModelSet_use_gv(ms, i)) {
358          sst->gv_mean = (double *) HTS_calloc(sst->vector_length, sizeof(double));
359          sst->gv_vari = (double *) HTS_calloc(sst->vector_length, sizeof(double));
360          HTS_ModelSet_get_gv(ms, i, HTS_Label_get_string(label, 0), HTS_Label_get_parsed(label, 0), (const double *const *) gv_iw, sst->gv_mean, sst->gv_vari);
361       } else {
362          sst->gv_mean = NULL;
363          sst->gv_vari = NULL;
364       }
365    }
366 
367    for (i = 0; i < HTS_Label_get_size(label); i++)
368       if (HTS_ModelSet_get_gv_flag(ms, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i)) == FALSE)
369          for (j = 0; j < sss->nstream; j++)
370             if (HTS_ModelSet_use_gv(ms, j) == TRUE)
371                for (k = 0; k < sss->nstate; k++)
372                   sss->sstream[j].gv_switch[i * sss->nstate + k] = FALSE;
373 
374    return TRUE;
375 }
376 
377 /* HTS_SStreamSet_get_nstream: get number of stream */
378 size_t HTS_SStreamSet_get_nstream(HTS_SStreamSet * sss)
379 {
380    return sss->nstream;
381 }
382 
383 /* HTS_SStreamSet_get_vector_length: get vector length */
384 size_t HTS_SStreamSet_get_vector_length(HTS_SStreamSet * sss, size_t stream_index)
385 {
386    return sss->sstream[stream_index].vector_length;
387 }
388 
389 /* HTS_SStreamSet_is_msd: get MSD flag */
390 HTS_Boolean HTS_SStreamSet_is_msd(HTS_SStreamSet * sss, size_t stream_index)
391 {
392    return sss->sstream[stream_index].msd ? TRUE : FALSE;
393 }
394 
395 /* HTS_SStreamSet_get_total_state: get total number of state */
396 size_t HTS_SStreamSet_get_total_state(HTS_SStreamSet * sss)
397 {
398    return sss->total_state;
399 }
400 
401 /* HTS_SStreamSet_get_total_frame: get total number of frame */
402 size_t HTS_SStreamSet_get_total_frame(HTS_SStreamSet * sss)
403 {
404    return sss->total_frame;
405 }
406 
407 /* HTS_SStreamSet_get_msd: get MSD parameter */
408 double HTS_SStreamSet_get_msd(HTS_SStreamSet * sss, size_t stream_index, size_t state_index)
409 {
410    return sss->sstream[stream_index].msd[state_index];
411 }
412 
413 /* HTS_SStreamSet_window_size: get dynamic window size */
414 size_t HTS_SStreamSet_get_window_size(HTS_SStreamSet * sss, size_t stream_index)
415 {
416    return sss->sstream[stream_index].win_size;
417 }
418 
419 /* HTS_SStreamSet_get_window_left_width: get left width of dynamic window */
420 int HTS_SStreamSet_get_window_left_width(HTS_SStreamSet * sss, size_t stream_index, size_t window_index)
421 {
422    return sss->sstream[stream_index].win_l_width[window_index];
423 }
424 
425 /* HTS_SStreamSet_get_winodow_right_width: get right width of dynamic window */
426 int HTS_SStreamSet_get_window_right_width(HTS_SStreamSet * sss, size_t stream_index, size_t window_index)
427 {
428    return sss->sstream[stream_index].win_r_width[window_index];
429 }
430 
431 /* HTS_SStreamSet_get_window_coefficient: get coefficient of dynamic window */
432 double HTS_SStreamSet_get_window_coefficient(HTS_SStreamSet * sss, size_t stream_index, size_t window_index, int coefficient_index)
433 {
434    return sss->sstream[stream_index].win_coefficient[window_index][coefficient_index];
435 }
436 
437 /* HTS_SStreamSet_get_window_max_width: get max width of dynamic window */
438 size_t HTS_SStreamSet_get_window_max_width(HTS_SStreamSet * sss, size_t stream_index)
439 {
440    return sss->sstream[stream_index].win_max_width;
441 }
442 
443 /* HTS_SStreamSet_use_gv: get GV flag */
444 HTS_Boolean HTS_SStreamSet_use_gv(HTS_SStreamSet * sss, size_t stream_index)
445 {
446    return sss->sstream[stream_index].gv_mean ? TRUE : FALSE;
447 }
448 
449 /* HTS_SStreamSet_get_duration: get state duration */
450 size_t HTS_SStreamSet_get_duration(HTS_SStreamSet * sss, size_t state_index)
451 {
452    return sss->duration[state_index];
453 }
454 
455 /* HTS_SStreamSet_get_mean: get mean parameter */
456 double HTS_SStreamSet_get_mean(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index)
457 {
458    return sss->sstream[stream_index].mean[state_index][vector_index];
459 }
460 
461 /* HTS_SStreamSet_set_mean: set mean parameter */
462 void HTS_SStreamSet_set_mean(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index, double f)
463 {
464    sss->sstream[stream_index].mean[state_index][vector_index] = f;
465 }
466 
467 /* HTS_SStreamSet_get_vari: get variance parameter */
468 double HTS_SStreamSet_get_vari(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index)
469 {
470    return sss->sstream[stream_index].vari[state_index][vector_index];
471 }
472 
473 /* HTS_SStreamSet_set_vari: set variance parameter */
474 void HTS_SStreamSet_set_vari(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index, double f)
475 {
476    sss->sstream[stream_index].vari[state_index][vector_index] = f;
477 }
478 
479 /* HTS_SStreamSet_get_gv_mean: get GV mean parameter */
480 double HTS_SStreamSet_get_gv_mean(HTS_SStreamSet * sss, size_t stream_index, size_t vector_index)
481 {
482    return sss->sstream[stream_index].gv_mean[vector_index];
483 }
484 
485 /* HTS_SStreamSet_get_gv_mean: get GV variance parameter */
486 double HTS_SStreamSet_get_gv_vari(HTS_SStreamSet * sss, size_t stream_index, size_t vector_index)
487 {
488    return sss->sstream[stream_index].gv_vari[vector_index];
489 }
490 
491 /* HTS_SStreamSet_set_gv_switch: set GV switch */
492 void HTS_SStreamSet_set_gv_switch(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, HTS_Boolean i)
493 {
494    sss->sstream[stream_index].gv_switch[state_index] = i;
495 }
496 
497 /* HTS_SStreamSet_get_gv_switch: get GV switch */
498 HTS_Boolean HTS_SStreamSet_get_gv_switch(HTS_SStreamSet * sss, size_t stream_index, size_t state_index)
499 {
500    return sss->sstream[stream_index].gv_switch[state_index];
501 }
502 
503 /* HTS_SStreamSet_clear: free state stream set */
504 void HTS_SStreamSet_clear(HTS_SStreamSet * sss)
505 {
506    size_t i, j;
507    HTS_SStream *sst;
508 
509    if (sss->sstream) {
510       for (i = 0; i < sss->nstream; i++) {
511          sst = &sss->sstream[i];
512          for (j = 0; j < sss->total_state; j++) {
513             HTS_free(sst->mean[j]);
514             HTS_free(sst->vari[j]);
515          }
516          if (sst->msd)
517             HTS_free(sst->msd);
518          HTS_free(sst->mean);
519          HTS_free(sst->vari);
520          for (j = 0; j < sst->win_size; j++) {
521             sst->win_coefficient[j] += sst->win_l_width[j];
522             HTS_free(sst->win_coefficient[j]);
523          }
524          HTS_free(sst->win_coefficient);
525          HTS_free(sst->win_l_width);
526          HTS_free(sst->win_r_width);
527          if (sst->gv_mean)
528             HTS_free(sst->gv_mean);
529          if (sst->gv_vari)
530             HTS_free(sst->gv_vari);
531          if (sst->gv_switch)
532             HTS_free(sst->gv_switch);
533       }
534       HTS_free(sss->sstream);
535    }
536    if (sss->duration)
537       HTS_free(sss->duration);
538 
539    HTS_SStreamSet_initialize(sss);
540 }
541 
542 HTS_SSTREAM_C_END;
543 
544 #endif                          /* !HTS_SSTREAM_C */
545