1 /* ----------------------------------------------------------------- */
2 /*           The HMM-Based Speech Synthesis Engine "hts_engine API"  */
3 /*           developed by HTS Working Group                          */
4 /*           http://hts-engine.sourceforge.net/                      */
5 /* ----------------------------------------------------------------- */
6 /*                                                                   */
7 /*  Copyright (c) 2001-2015  Nagoya Institute of Technology          */
8 /*                           Department of Computer Science          */
9 /*                                                                   */
10 /*                2001-2008  Tokyo Institute of Technology           */
11 /*                           Interdisciplinary Graduate School of    */
12 /*                           Science and Engineering                 */
13 /*                                                                   */
14 /* All rights reserved.                                              */
15 /*                                                                   */
16 /* Redistribution and use in source and binary forms, with or        */
17 /* without modification, are permitted provided that the following   */
18 /* conditions are met:                                               */
19 /*                                                                   */
20 /* - Redistributions of source code must retain the above copyright  */
21 /*   notice, this list of conditions and the following disclaimer.   */
22 /* - Redistributions in binary form must reproduce the above         */
23 /*   copyright notice, this list of conditions and the following     */
24 /*   disclaimer in the documentation and/or other materials provided */
25 /*   with the distribution.                                          */
26 /* - Neither the name of the HTS working group nor the names of its  */
27 /*   contributors may be used to endorse or promote products derived */
28 /*   from this software without specific prior written permission.   */
29 /*                                                                   */
30 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND            */
31 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
32 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
33 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
34 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
35 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,          */
36 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED   */
37 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     */
38 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
39 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   */
40 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY    */
41 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
42 /* POSSIBILITY OF SUCH DAMAGE.                                       */
43 /* ----------------------------------------------------------------- */
44 
45 #ifndef HTS_ENGINE_C
46 #define HTS_ENGINE_C
47 
48 #ifdef __cplusplus
49 #define HTS_ENGINE_C_START extern "C" {
50 #define HTS_ENGINE_C_END   }
51 #else
52 #define HTS_ENGINE_C_START
53 #define HTS_ENGINE_C_END
54 #endif                          /* __CPLUSPLUS */
55 
56 HTS_ENGINE_C_START;
57 
58 #include <stdlib.h>             /* for atof() */
59 #include <string.h>             /* for strcpy() */
60 #include <math.h>               /* for pow() */
61 
62 /* hts_engine libraries */
63 #include "HTS_hidden.h"
64 
65 /* HTS_Engine_initialize: initialize engine */
HTS_Engine_initialize(HTS_Engine * engine)66 void HTS_Engine_initialize(HTS_Engine * engine)
67 {
68    /* global */
69    engine->condition.sampling_frequency = 0;
70    engine->condition.fperiod = 0;
71    engine->condition.audio_buff_size = 0;
72    engine->condition.stop = FALSE;
73    engine->condition.volume = 1.0;
74    engine->condition.msd_threshold = NULL;
75    engine->condition.gv_weight = NULL;
76 
77    /* duration */
78    engine->condition.speed = 1.0;
79    engine->condition.phoneme_alignment_flag = FALSE;
80 
81    /* spectrum */
82    engine->condition.stage = 0;
83    engine->condition.use_log_gain = FALSE;
84    engine->condition.alpha = 0.0;
85    engine->condition.beta = 0.0;
86 
87    /* log F0 */
88    engine->condition.additional_half_tone = 0.0;
89 
90    /* interpolation weights */
91    engine->condition.duration_iw = NULL;
92    engine->condition.parameter_iw = NULL;
93    engine->condition.gv_iw = NULL;
94 
95    /* initialize audio */
96    HTS_Audio_initialize(&engine->audio);
97    /* initialize model set */
98    HTS_ModelSet_initialize(&engine->ms);
99    /* initialize label list */
100    HTS_Label_initialize(&engine->label);
101    /* initialize state sequence set */
102    HTS_SStreamSet_initialize(&engine->sss);
103    /* initialize pstream set */
104    HTS_PStreamSet_initialize(&engine->pss);
105    /* initialize gstream set */
106    HTS_GStreamSet_initialize(&engine->gss);
107 }
108 
109 /* HTS_Engine_load: load HTS voices */
HTS_Engine_load(HTS_Engine * engine,char ** voices,size_t num_voices)110 HTS_Boolean HTS_Engine_load(HTS_Engine * engine, char **voices, size_t num_voices)
111 {
112    size_t i, j;
113    size_t nstream;
114    double average_weight;
115    const char *option, *find;
116 
117    /* reset engine */
118    HTS_Engine_clear(engine);
119 
120    /* load voices */
121    if (HTS_ModelSet_load(&engine->ms, voices, num_voices) != TRUE) {
122       HTS_Engine_clear(engine);
123       return FALSE;
124    }
125    nstream = HTS_ModelSet_get_nstream(&engine->ms);
126    average_weight = 1.0 / num_voices;
127 
128    /* global */
129    engine->condition.sampling_frequency = HTS_ModelSet_get_sampling_frequency(&engine->ms);
130    engine->condition.fperiod = HTS_ModelSet_get_fperiod(&engine->ms);
131    engine->condition.msd_threshold = (double *) HTS_calloc(nstream, sizeof(double));
132    for (i = 0; i < nstream; i++)
133       engine->condition.msd_threshold[i] = 0.5;
134    engine->condition.gv_weight = (double *) HTS_calloc(nstream, sizeof(double));
135    for (i = 0; i < nstream; i++)
136       engine->condition.gv_weight[i] = 1.0;
137 
138    /* spectrum */
139    option = HTS_ModelSet_get_option(&engine->ms, 0);
140    find = strstr(option, "GAMMA=");
141    if (find != NULL)
142       engine->condition.stage = (size_t) atoi(&find[strlen("GAMMA=")]);
143    find = strstr(option, "LN_GAIN=");
144    if (find != NULL)
145       engine->condition.use_log_gain = atoi(&find[strlen("LN_GAIN=")]) == 1 ? TRUE : FALSE;
146    find = strstr(option, "ALPHA=");
147    if (find != NULL)
148       engine->condition.alpha = atof(&find[strlen("ALPHA=")]);
149 
150    /* interpolation weights */
151    engine->condition.duration_iw = (double *) HTS_calloc(num_voices, sizeof(double));
152    for (i = 0; i < num_voices; i++)
153       engine->condition.duration_iw[i] = average_weight;
154    engine->condition.parameter_iw = (double **) HTS_calloc(num_voices, sizeof(double *));
155    for (i = 0; i < num_voices; i++) {
156       engine->condition.parameter_iw[i] = (double *) HTS_calloc(nstream, sizeof(double));
157       for (j = 0; j < nstream; j++)
158          engine->condition.parameter_iw[i][j] = average_weight;
159    }
160    engine->condition.gv_iw = (double **) HTS_calloc(num_voices, sizeof(double *));
161    for (i = 0; i < num_voices; i++) {
162       engine->condition.gv_iw[i] = (double *) HTS_calloc(nstream, sizeof(double));
163       for (j = 0; j < nstream; j++)
164          engine->condition.gv_iw[i][j] = average_weight;
165    }
166 
167    return TRUE;
168 }
169 
170 /* HTS_Engine_set_sampling_frequency: set sampling frequency */
HTS_Engine_set_sampling_frequency(HTS_Engine * engine,size_t i)171 void HTS_Engine_set_sampling_frequency(HTS_Engine * engine, size_t i)
172 {
173    if (i < 1)
174       i = 1;
175    engine->condition.sampling_frequency = i;
176    HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size);
177 }
178 
179 /* HTS_Engine_get_sampling_frequency: get sampling frequency */
HTS_Engine_get_sampling_frequency(HTS_Engine * engine)180 size_t HTS_Engine_get_sampling_frequency(HTS_Engine * engine)
181 {
182    return engine->condition.sampling_frequency;
183 }
184 
185 /* HTS_Engine_set_fperiod: set frame period */
HTS_Engine_set_fperiod(HTS_Engine * engine,size_t i)186 void HTS_Engine_set_fperiod(HTS_Engine * engine, size_t i)
187 {
188    if (i < 1)
189       i = 1;
190    engine->condition.fperiod = i;
191 }
192 
193 /* HTS_Engine_get_fperiod: get frame period */
HTS_Engine_get_fperiod(HTS_Engine * engine)194 size_t HTS_Engine_get_fperiod(HTS_Engine * engine)
195 {
196    return engine->condition.fperiod;
197 }
198 
199 /* HTS_Engine_set_audio_buff_size: set audio buffer size */
HTS_Engine_set_audio_buff_size(HTS_Engine * engine,size_t i)200 void HTS_Engine_set_audio_buff_size(HTS_Engine * engine, size_t i)
201 {
202    engine->condition.audio_buff_size = i;
203    HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size);
204 }
205 
206 /* HTS_Engine_get_audio_buff_size: get audio buffer size */
HTS_Engine_get_audio_buff_size(HTS_Engine * engine)207 size_t HTS_Engine_get_audio_buff_size(HTS_Engine * engine)
208 {
209    return engine->condition.audio_buff_size;
210 }
211 
212 /* HTS_Engine_set_stop_flag: set stop flag */
HTS_Engine_set_stop_flag(HTS_Engine * engine,HTS_Boolean b)213 void HTS_Engine_set_stop_flag(HTS_Engine * engine, HTS_Boolean b)
214 {
215    engine->condition.stop = b;
216 }
217 
218 /* HTS_Engine_get_stop_flag: get stop flag */
HTS_Engine_get_stop_flag(HTS_Engine * engine)219 HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine * engine)
220 {
221    return engine->condition.stop;
222 }
223 
224 /* HTS_Engine_set_volume: set volume in db */
HTS_Engine_set_volume(HTS_Engine * engine,double f)225 void HTS_Engine_set_volume(HTS_Engine * engine, double f)
226 {
227    engine->condition.volume = exp(f * DB);
228 }
229 
230 /* HTS_Engine_get_volume: get volume in db */
HTS_Engine_get_volume(HTS_Engine * engine)231 double HTS_Engine_get_volume(HTS_Engine * engine)
232 {
233    return log(engine->condition.volume) / DB;
234 }
235 
236 /* HTS_Egnine_set_msd_threshold: set MSD threshold */
HTS_Engine_set_msd_threshold(HTS_Engine * engine,size_t stream_index,double f)237 void HTS_Engine_set_msd_threshold(HTS_Engine * engine, size_t stream_index, double f)
238 {
239    if (f < 0.0)
240       f = 0.0;
241    if (f > 1.0)
242       f = 1.0;
243    engine->condition.msd_threshold[stream_index] = f;
244 }
245 
246 /* HTS_Engine_get_msd_threshold: get MSD threshold */
HTS_Engine_get_msd_threshold(HTS_Engine * engine,size_t stream_index)247 double HTS_Engine_get_msd_threshold(HTS_Engine * engine, size_t stream_index)
248 {
249    return engine->condition.msd_threshold[stream_index];
250 }
251 
252 /* HTS_Engine_set_gv_weight: set GV weight */
HTS_Engine_set_gv_weight(HTS_Engine * engine,size_t stream_index,double f)253 void HTS_Engine_set_gv_weight(HTS_Engine * engine, size_t stream_index, double f)
254 {
255    if (f < 0.0)
256       f = 0.0;
257    engine->condition.gv_weight[stream_index] = f;
258 }
259 
260 /* HTS_Engine_get_gv_weight: get GV weight */
HTS_Engine_get_gv_weight(HTS_Engine * engine,size_t stream_index)261 double HTS_Engine_get_gv_weight(HTS_Engine * engine, size_t stream_index)
262 {
263    return engine->condition.gv_weight[stream_index];
264 }
265 
266 /* HTS_Engine_set_speed: set speech speed */
HTS_Engine_set_speed(HTS_Engine * engine,double f)267 void HTS_Engine_set_speed(HTS_Engine * engine, double f)
268 {
269    if (f < 1.0E-06)
270       f = 1.0E-06;
271    engine->condition.speed = f;
272 }
273 
274 /* HTS_Engine_set_phoneme_alignment_flag: set flag for using phoneme alignment in label */
HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine,HTS_Boolean b)275 void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine, HTS_Boolean b)
276 {
277    engine->condition.phoneme_alignment_flag = b;
278 }
279 
280 /* HTS_Engine_set_alpha: set alpha */
HTS_Engine_set_alpha(HTS_Engine * engine,double f)281 void HTS_Engine_set_alpha(HTS_Engine * engine, double f)
282 {
283    if (f < 0.0)
284       f = 0.0;
285    if (f > 1.0)
286       f = 1.0;
287    engine->condition.alpha = f;
288 }
289 
290 /* HTS_Engine_get_alpha: get alpha */
HTS_Engine_get_alpha(HTS_Engine * engine)291 double HTS_Engine_get_alpha(HTS_Engine * engine)
292 {
293    return engine->condition.alpha;
294 }
295 
296 /* HTS_Engine_set_beta: set beta */
HTS_Engine_set_beta(HTS_Engine * engine,double f)297 void HTS_Engine_set_beta(HTS_Engine * engine, double f)
298 {
299    if (f < 0.0)
300       f = 0.0;
301    if (f > 1.0)
302       f = 1.0;
303    engine->condition.beta = f;
304 }
305 
306 /* HTS_Engine_get_beta: get beta */
HTS_Engine_get_beta(HTS_Engine * engine)307 double HTS_Engine_get_beta(HTS_Engine * engine)
308 {
309    return engine->condition.beta;
310 }
311 
312 /* HTS_Engine_add_half_tone: add half tone */
HTS_Engine_add_half_tone(HTS_Engine * engine,double f)313 void HTS_Engine_add_half_tone(HTS_Engine * engine, double f)
314 {
315    engine->condition.additional_half_tone = f;
316 }
317 
318 /* HTS_Engine_set_duration_interpolation_weight: set interpolation weight for duration */
HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine,size_t voice_index,double f)319 void HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index, double f)
320 {
321    engine->condition.duration_iw[voice_index] = f;
322 }
323 
324 /* HTS_Engine_get_duration_interpolation_weight: get interpolation weight for duration */
HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine,size_t voice_index)325 double HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index)
326 {
327    return engine->condition.duration_iw[voice_index];
328 }
329 
330 /* HTS_Engine_set_parameter_interpolation_weight: set interpolation weight for parameter */
HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index,double f)331 void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
332 {
333    engine->condition.parameter_iw[voice_index][stream_index] = f;
334 }
335 
336 /* HTS_Engine_get_parameter_interpolation_weight: get interpolation weight for parameter */
HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index)337 double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
338 {
339    return engine->condition.parameter_iw[voice_index][stream_index];
340 }
341 
342 /* HTS_Engine_set_gv_interpolation_weight: set interpolation weight for GV */
HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index,double f)343 void HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
344 {
345    engine->condition.gv_iw[voice_index][stream_index] = f;
346 }
347 
348 /* HTS_Engine_get_gv_interpolation_weight: get interpolation weight for GV */
HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index)349 double HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
350 {
351    return engine->condition.gv_iw[voice_index][stream_index];
352 }
353 
354 /* HTS_Engine_get_total_state: get total number of state */
HTS_Engine_get_total_state(HTS_Engine * engine)355 size_t HTS_Engine_get_total_state(HTS_Engine * engine)
356 {
357    return HTS_SStreamSet_get_total_state(&engine->sss);
358 }
359 
360 /* HTS_Engine_set_state_mean: set mean value of state */
HTS_Engine_set_state_mean(HTS_Engine * engine,size_t stream_index,size_t state_index,size_t vector_index,double f)361 void HTS_Engine_set_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index, double f)
362 {
363    HTS_SStreamSet_set_mean(&engine->sss, stream_index, state_index, vector_index, f);
364 }
365 
366 /* HTS_Engine_get_state_mean: get mean value of state */
HTS_Engine_get_state_mean(HTS_Engine * engine,size_t stream_index,size_t state_index,size_t vector_index)367 double HTS_Engine_get_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index)
368 {
369    return HTS_SStreamSet_get_mean(&engine->sss, stream_index, state_index, vector_index);
370 }
371 
372 /* HTS_Engine_get_state_duration: get state duration */
HTS_Engine_get_state_duration(HTS_Engine * engine,size_t state_index)373 size_t HTS_Engine_get_state_duration(HTS_Engine * engine, size_t state_index)
374 {
375    return HTS_SStreamSet_get_duration(&engine->sss, state_index);
376 }
377 
378 /* HTS_Engine_get_nvoices: get number of voices */
HTS_Engine_get_nvoices(HTS_Engine * engine)379 size_t HTS_Engine_get_nvoices(HTS_Engine * engine)
380 {
381    return HTS_ModelSet_get_nvoices(&engine->ms);
382 }
383 
384 /* HTS_Engine_get_nstream: get number of stream */
HTS_Engine_get_nstream(HTS_Engine * engine)385 size_t HTS_Engine_get_nstream(HTS_Engine * engine)
386 {
387    return HTS_ModelSet_get_nstream(&engine->ms);
388 }
389 
390 /* HTS_Engine_get_nstate: get number of state */
HTS_Engine_get_nstate(HTS_Engine * engine)391 size_t HTS_Engine_get_nstate(HTS_Engine * engine)
392 {
393    return HTS_ModelSet_get_nstate(&engine->ms);
394 }
395 
396 /* HTS_Engine_get_fullcontext_label_format: get full context label format */
HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine)397 const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine)
398 {
399    return HTS_ModelSet_get_fullcontext_label_format(&engine->ms);
400 }
401 
402 /* HTS_Engine_get_fullcontext_label_version: get full context label version */
HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine)403 const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine)
404 {
405    return HTS_ModelSet_get_fullcontext_label_version(&engine->ms);
406 }
407 
408 /* HTS_Engine_get_total_frame: get total number of frame */
HTS_Engine_get_total_frame(HTS_Engine * engine)409 size_t HTS_Engine_get_total_frame(HTS_Engine * engine)
410 {
411    return HTS_GStreamSet_get_total_frame(&engine->gss);
412 }
413 
414 /* HTS_Engine_get_nsamples: get number of samples */
HTS_Engine_get_nsamples(HTS_Engine * engine)415 size_t HTS_Engine_get_nsamples(HTS_Engine * engine)
416 {
417    return HTS_GStreamSet_get_total_nsamples(&engine->gss);
418 }
419 
420 /* HTS_Engine_get_generated_parameter: output generated parameter */
HTS_Engine_get_generated_parameter(HTS_Engine * engine,size_t stream_index,size_t frame_index,size_t vector_index)421 double HTS_Engine_get_generated_parameter(HTS_Engine * engine, size_t stream_index, size_t frame_index, size_t vector_index)
422 {
423    return HTS_GStreamSet_get_parameter(&engine->gss, stream_index, frame_index, vector_index);
424 }
425 
426 /* HTS_Engine_get_generated_speech: output generated speech */
HTS_Engine_get_generated_speech(HTS_Engine * engine,size_t index)427 double HTS_Engine_get_generated_speech(HTS_Engine * engine, size_t index)
428 {
429    return HTS_GStreamSet_get_speech(&engine->gss, index);
430 }
431 
432 /* HTS_Engine_generate_state_sequence: genereate state sequence (1st synthesis step) */
HTS_Engine_generate_state_sequence(HTS_Engine * engine)433 static HTS_Boolean HTS_Engine_generate_state_sequence(HTS_Engine * engine)
434 {
435    size_t i, state_index, model_index;
436    double f;
437 
438    if (HTS_SStreamSet_create(&engine->sss, &engine->ms, &engine->label, engine->condition.phoneme_alignment_flag, engine->condition.speed, engine->condition.duration_iw, engine->condition.parameter_iw, engine->condition.gv_iw) != TRUE) {
439       HTS_Engine_refresh(engine);
440       return FALSE;
441    }
442    if (engine->condition.additional_half_tone != 0.0) {
443       state_index = 0;
444       model_index = 0;
445       for (i = 0; i < HTS_Engine_get_total_state(engine); i++) {
446          f = HTS_Engine_get_state_mean(engine, 1, i, 0);
447          f += engine->condition.additional_half_tone * HALF_TONE;
448          if (f < MIN_LF0)
449             f = MIN_LF0;
450          else if (f > MAX_LF0)
451             f = MAX_LF0;
452          HTS_Engine_set_state_mean(engine, 1, i, 0, f);
453          state_index++;
454          if (state_index >= HTS_Engine_get_nstate(engine)) {
455             state_index = 0;
456             model_index++;
457          }
458       }
459    }
460    return TRUE;
461 }
462 
463 /* HTS_Engine_generate_state_sequence_from_fn: genereate state sequence from file name (1st synthesis step) */
HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine,const char * fn)464 HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine, const char *fn)
465 {
466    HTS_Engine_refresh(engine);
467    HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
468    return HTS_Engine_generate_state_sequence(engine);
469 }
470 
471 /* HTS_Engine_generate_state_sequence_from_strings: generate state sequence from strings (1st synthesis step) */
HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine,char ** lines,size_t num_lines)472 HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
473 {
474    HTS_Engine_refresh(engine);
475    HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
476    return HTS_Engine_generate_state_sequence(engine);
477 }
478 
479 /* HTS_Engine_generate_parameter_sequence: generate parameter sequence (2nd synthesis step) */
HTS_Engine_generate_parameter_sequence(HTS_Engine * engine)480 HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine * engine)
481 {
482    return HTS_PStreamSet_create(&engine->pss, &engine->sss, engine->condition.msd_threshold, engine->condition.gv_weight);
483 }
484 
485 /* HTS_Engine_generate_sample_sequence: generate sample sequence (3rd synthesis step) */
HTS_Engine_generate_sample_sequence(HTS_Engine * engine)486 HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine * engine)
487 {
488    return HTS_GStreamSet_create(&engine->gss, &engine->pss, engine->condition.stage, engine->condition.use_log_gain, engine->condition.sampling_frequency, engine->condition.fperiod, engine->condition.alpha, engine->condition.beta, &engine->condition.stop, engine->condition.volume, engine->condition.audio_buff_size > 0 ? &engine->audio : NULL);
489 }
490 
491 /* HTS_Engine_synthesize: synthesize speech */
HTS_Engine_synthesize(HTS_Engine * engine)492 static HTS_Boolean HTS_Engine_synthesize(HTS_Engine * engine)
493 {
494    if (HTS_Engine_generate_state_sequence(engine) != TRUE) {
495       HTS_Engine_refresh(engine);
496       return FALSE;
497    }
498    if (HTS_Engine_generate_parameter_sequence(engine) != TRUE) {
499       HTS_Engine_refresh(engine);
500       return FALSE;
501    }
502    if (HTS_Engine_generate_sample_sequence(engine) != TRUE) {
503       HTS_Engine_refresh(engine);
504       return FALSE;
505    }
506    return TRUE;
507 }
508 
509 /* HTS_Engine_synthesize_from_fn: synthesize speech from file name */
HTS_Engine_synthesize_from_fn(HTS_Engine * engine,const char * fn)510 HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn)
511 {
512    HTS_Engine_refresh(engine);
513    HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
514    return HTS_Engine_synthesize(engine);
515 }
516 
517 /* HTS_Engine_synthesize_from_strings: synthesize speech from strings */
HTS_Engine_synthesize_from_strings(HTS_Engine * engine,char ** lines,size_t num_lines)518 HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
519 {
520    HTS_Engine_refresh(engine);
521    HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
522    return HTS_Engine_synthesize(engine);
523 }
524 
525 /* HTS_Engine_save_information: save trace information */
HTS_Engine_save_information(HTS_Engine * engine,FILE * fp)526 void HTS_Engine_save_information(HTS_Engine * engine, FILE * fp)
527 {
528    size_t i, j, k, l, m, n;
529    double temp;
530    HTS_Condition *condition = &engine->condition;
531    HTS_ModelSet *ms = &engine->ms;
532    HTS_Label *label = &engine->label;
533    HTS_SStreamSet *sss = &engine->sss;
534    HTS_PStreamSet *pss = &engine->pss;
535 
536    /* global parameter */
537    fprintf(fp, "[Global parameter]\n");
538    fprintf(fp, "Sampring frequency                     -> %8lu(Hz)\n", (unsigned long) condition->sampling_frequency);
539    fprintf(fp, "Frame period                           -> %8lu(point)\n", (unsigned long) condition->fperiod);
540    fprintf(fp, "                                          %8.5f(msec)\n", 1e+3 * condition->fperiod / condition->sampling_frequency);
541    fprintf(fp, "All-pass constant                      -> %8.5f\n", (float) condition->alpha);
542    fprintf(fp, "Gamma                                  -> %8.5f\n", (float) (condition->stage == 0 ? 0.0 : -1.0 / condition->stage));
543    if (condition->stage != 0) {
544       if (condition->use_log_gain == TRUE)
545          fprintf(fp, "Log gain flag                          ->     TRUE\n");
546       else
547          fprintf(fp, "Log gain flag                          ->    FALSE\n");
548    }
549    fprintf(fp, "Postfiltering coefficient              -> %8.5f\n", (float) condition->beta);
550    fprintf(fp, "Audio buffer size                      -> %8lu(sample)\n", (unsigned long) condition->audio_buff_size);
551    fprintf(fp, "\n");
552 
553    /* duration parameter */
554    fprintf(fp, "[Duration parameter]\n");
555    fprintf(fp, "Number of states                       -> %8lu\n", (unsigned long) HTS_ModelSet_get_nstate(ms));
556    fprintf(fp, "         Interpolation size            -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
557    /* check interpolation */
558    for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++)
559       temp += condition->duration_iw[i];
560    for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
561       if (condition->duration_iw[i] != 0.0)
562          condition->duration_iw[i] /= temp;
563    for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
564       fprintf(fp, "         Interpolation weight[%2lu]      -> %8.0f(%%)\n", (unsigned long) i, (float) (100 * condition->duration_iw[i]));
565    fprintf(fp, "\n");
566 
567    fprintf(fp, "[Stream parameter]\n");
568    for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) {
569       /* stream parameter */
570       fprintf(fp, "Stream[%2lu] vector length               -> %8lu\n", (unsigned long) i, (unsigned long) HTS_ModelSet_get_vector_length(ms, i));
571       fprintf(fp, "           Dynamic window size         -> %8lu\n", (unsigned long) HTS_ModelSet_get_window_size(ms, i));
572       /* interpolation */
573       fprintf(fp, "           Interpolation size          -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
574       for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
575          temp += condition->parameter_iw[j][i];
576       for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
577          if (condition->parameter_iw[j][i] != 0.0)
578             condition->parameter_iw[j][i] /= temp;
579       for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
580          fprintf(fp, "           Interpolation weight[%2lu]    -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->parameter_iw[j][i]));
581       /* MSD */
582       if (HTS_ModelSet_is_msd(ms, i)) { /* for MSD */
583          fprintf(fp, "           MSD flag                    ->     TRUE\n");
584          fprintf(fp, "           MSD threshold               -> %8.5f\n", condition->msd_threshold[i]);
585       } else {                  /* for non MSD */
586          fprintf(fp, "           MSD flag                    ->    FALSE\n");
587       }
588       /* GV */
589       if (HTS_ModelSet_use_gv(ms, i)) {
590          fprintf(fp, "           GV flag                     ->     TRUE\n");
591          fprintf(fp, "           GV weight                   -> %8.0f(%%)\n", (float) (100 * condition->gv_weight[i]));
592          fprintf(fp, "           GV interpolation size       -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
593          /* interpolation */
594          for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
595             temp += condition->gv_iw[j][i];
596          for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
597             if (condition->gv_iw[j][i] != 0.0)
598                condition->gv_iw[j][i] /= temp;
599          for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
600             fprintf(fp, "           GV interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->gv_iw[j][i]));
601       } else {
602          fprintf(fp, "           GV flag                     ->    FALSE\n");
603       }
604    }
605    fprintf(fp, "\n");
606 
607    /* generated sequence */
608    fprintf(fp, "[Generated sequence]\n");
609    fprintf(fp, "Number of HMMs                         -> %8lu\n", (unsigned long) HTS_Label_get_size(label));
610    fprintf(fp, "Number of stats                        -> %8lu\n", (unsigned long) HTS_Label_get_size(label) * HTS_ModelSet_get_nstate(ms));
611    fprintf(fp, "Length of this speech                  -> %8.3f(sec)\n", (float) ((double) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod / condition->sampling_frequency));
612    fprintf(fp, "                                       -> %8lu(frames)\n", (unsigned long) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod);
613 
614    for (i = 0; i < HTS_Label_get_size(label); i++) {
615       fprintf(fp, "HMM[%2lu]\n", (unsigned long) i);
616       fprintf(fp, "  Name                                 -> %s\n", HTS_Label_get_string(label, i));
617       fprintf(fp, "  Duration\n");
618       for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) {
619          fprintf(fp, "    Interpolation[%2lu]\n", (unsigned long) j);
620          HTS_ModelSet_get_duration_index(ms, j, HTS_Label_get_string(label, i), &k, &l);
621          fprintf(fp, "      Tree index                       -> %8lu\n", (unsigned long) k);
622          fprintf(fp, "      PDF index                        -> %8lu\n", (unsigned long) l);
623       }
624       for (j = 0; j < HTS_ModelSet_get_nstate(ms); j++) {
625          fprintf(fp, "  State[%2lu]\n", (unsigned long) j + 2);
626          fprintf(fp, "    Length                             -> %8lu(frames)\n", (unsigned long) HTS_SStreamSet_get_duration(sss, i * HTS_ModelSet_get_nstate(ms) + j));
627          for (k = 0; k < HTS_ModelSet_get_nstream(ms); k++) {
628             fprintf(fp, "    Stream[%2lu]\n", (unsigned long) k);
629             if (HTS_ModelSet_is_msd(ms, k)) {
630                if (HTS_SStreamSet_get_msd(sss, k, i * HTS_ModelSet_get_nstate(ms) + j) > condition->msd_threshold[k])
631                   fprintf(fp, "      MSD flag                         ->     TRUE\n");
632                else
633                   fprintf(fp, "      MSD flag                         ->    FALSE\n");
634             }
635             for (l = 0; l < HTS_ModelSet_get_nvoices(ms); l++) {
636                fprintf(fp, "      Interpolation[%2lu]\n", (unsigned long) l);
637                HTS_ModelSet_get_parameter_index(ms, l, k, j + 2, HTS_Label_get_string(label, i), &m, &n);
638                fprintf(fp, "        Tree index                     -> %8lu\n", (unsigned long) m);
639                fprintf(fp, "        PDF index                      -> %8lu\n", (unsigned long) n);
640             }
641          }
642       }
643    }
644 }
645 
646 /* HTS_Engine_save_label: save label with time */
HTS_Engine_save_label(HTS_Engine * engine,FILE * fp)647 void HTS_Engine_save_label(HTS_Engine * engine, FILE * fp)
648 {
649    size_t i, j;
650    size_t frame, state, duration;
651 
652    HTS_Label *label = &engine->label;
653    HTS_SStreamSet *sss = &engine->sss;
654    size_t nstate = HTS_ModelSet_get_nstate(&engine->ms);
655    double rate = engine->condition.fperiod * 1.0e+07 / engine->condition.sampling_frequency;
656 
657    for (i = 0, state = 0, frame = 0; i < HTS_Label_get_size(label); i++) {
658       for (j = 0, duration = 0; j < nstate; j++)
659          duration += HTS_SStreamSet_get_duration(sss, state++);
660       fprintf(fp, "%lu %lu %s\n", (unsigned long) (frame * rate), (unsigned long) ((frame + duration) * rate), HTS_Label_get_string(label, i));
661       frame += duration;
662    }
663 }
664 
665 /* HTS_Engine_save_generated_parameter: save generated parameter */
HTS_Engine_save_generated_parameter(HTS_Engine * engine,size_t stream_index,FILE * fp)666 void HTS_Engine_save_generated_parameter(HTS_Engine * engine, size_t stream_index, FILE * fp)
667 {
668    size_t i, j;
669    float temp;
670    HTS_GStreamSet *gss = &engine->gss;
671 
672    for (i = 0; i < HTS_GStreamSet_get_total_frame(gss); i++)
673       for (j = 0; j < HTS_GStreamSet_get_vector_length(gss, stream_index); j++) {
674          temp = (float) HTS_GStreamSet_get_parameter(gss, stream_index, i, j);
675          fwrite(&temp, sizeof(float), 1, fp);
676       }
677 }
678 
679 /* HTS_Engine_save_generated_speech: save generated speech */
HTS_Engine_save_generated_speech(HTS_Engine * engine,FILE * fp)680 void HTS_Engine_save_generated_speech(HTS_Engine * engine, FILE * fp)
681 {
682    size_t i;
683    double x;
684    short temp;
685    HTS_GStreamSet *gss = &engine->gss;
686 
687    for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) {
688       x = HTS_GStreamSet_get_speech(gss, i);
689       if (x > 32767.0)
690          temp = 32767;
691       else if (x < -32768.0)
692          temp = -32768;
693       else
694          temp = (short) x;
695       fwrite(&temp, sizeof(short), 1, fp);
696    }
697 }
698 
699 /* HTS_Engine_save_riff: save RIFF format file */
HTS_Engine_save_riff(HTS_Engine * engine,FILE * fp)700 void HTS_Engine_save_riff(HTS_Engine * engine, FILE * fp)
701 {
702    size_t i;
703    double x;
704    short temp;
705 
706    HTS_GStreamSet *gss = &engine->gss;
707    char data_01_04[] = { 'R', 'I', 'F', 'F' };
708    int data_05_08 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short) + 36;
709    char data_09_12[] = { 'W', 'A', 'V', 'E' };
710    char data_13_16[] = { 'f', 'm', 't', ' ' };
711    int data_17_20 = 16;
712    short data_21_22 = 1;        /* PCM */
713    short data_23_24 = 1;        /* monoral */
714    int data_25_28 = engine->condition.sampling_frequency;
715    int data_29_32 = engine->condition.sampling_frequency * sizeof(short);
716    short data_33_34 = sizeof(short);
717    short data_35_36 = (short) (sizeof(short) * 8);
718    char data_37_40[] = { 'd', 'a', 't', 'a' };
719    int data_41_44 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short);
720 
721    /* write header */
722    HTS_fwrite_little_endian(data_01_04, sizeof(char), 4, fp);
723    HTS_fwrite_little_endian(&data_05_08, sizeof(int), 1, fp);
724    HTS_fwrite_little_endian(data_09_12, sizeof(char), 4, fp);
725    HTS_fwrite_little_endian(data_13_16, sizeof(char), 4, fp);
726    HTS_fwrite_little_endian(&data_17_20, sizeof(int), 1, fp);
727    HTS_fwrite_little_endian(&data_21_22, sizeof(short), 1, fp);
728    HTS_fwrite_little_endian(&data_23_24, sizeof(short), 1, fp);
729    HTS_fwrite_little_endian(&data_25_28, sizeof(int), 1, fp);
730    HTS_fwrite_little_endian(&data_29_32, sizeof(int), 1, fp);
731    HTS_fwrite_little_endian(&data_33_34, sizeof(short), 1, fp);
732    HTS_fwrite_little_endian(&data_35_36, sizeof(short), 1, fp);
733    HTS_fwrite_little_endian(data_37_40, sizeof(char), 4, fp);
734    HTS_fwrite_little_endian(&data_41_44, sizeof(int), 1, fp);
735    /* write data */
736    for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) {
737       x = HTS_GStreamSet_get_speech(gss, i);
738       if (x > 32767.0)
739          temp = 32767;
740       else if (x < -32768.0)
741          temp = -32768;
742       else
743          temp = (short) x;
744       HTS_fwrite_little_endian(&temp, sizeof(short), 1, fp);
745    }
746 }
747 
748 /* HTS_Engine_refresh: free model per one time synthesis */
HTS_Engine_refresh(HTS_Engine * engine)749 void HTS_Engine_refresh(HTS_Engine * engine)
750 {
751    /* free generated parameter stream set */
752    HTS_GStreamSet_clear(&engine->gss);
753    /* free parameter stream set */
754    HTS_PStreamSet_clear(&engine->pss);
755    /* free state stream set */
756    HTS_SStreamSet_clear(&engine->sss);
757    /* free label list */
758    HTS_Label_clear(&engine->label);
759    /* stop flag */
760    engine->condition.stop = FALSE;
761 }
762 
763 /* HTS_Engine_clear: free engine */
HTS_Engine_clear(HTS_Engine * engine)764 void HTS_Engine_clear(HTS_Engine * engine)
765 {
766    size_t i;
767 
768    if (engine->condition.msd_threshold != NULL)
769       HTS_free(engine->condition.msd_threshold);
770    if (engine->condition.duration_iw != NULL)
771       HTS_free(engine->condition.duration_iw);
772    if (engine->condition.gv_weight != NULL)
773       HTS_free(engine->condition.gv_weight);
774    if (engine->condition.parameter_iw != NULL) {
775       for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++)
776          HTS_free(engine->condition.parameter_iw[i]);
777       HTS_free(engine->condition.parameter_iw);
778    }
779    if (engine->condition.gv_iw != NULL) {
780       for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++)
781          HTS_free(engine->condition.gv_iw[i]);
782       HTS_free(engine->condition.gv_iw);
783    }
784 
785    HTS_ModelSet_clear(&engine->ms);
786    HTS_Audio_clear(&engine->audio);
787    HTS_Engine_initialize(engine);
788 }
789 
790 HTS_ENGINE_C_END;
791 
792 #endif                          /* !HTS_ENGINE_C */
793