1 /* ----------------------------------------------------------------- */
2 /* The HMM-Based Speech Synthesis Engine "hts_engine API" */
3 /* developed by HTS Working Group */
4 /* http://hts-engine.sourceforge.net/ */
5 /* ----------------------------------------------------------------- */
6 /* */
7 /* Copyright (c) 2001-2015 Nagoya Institute of Technology */
8 /* Department of Computer Science */
9 /* */
10 /* 2001-2008 Tokyo Institute of Technology */
11 /* Interdisciplinary Graduate School of */
12 /* Science and Engineering */
13 /* */
14 /* All rights reserved. */
15 /* */
16 /* Redistribution and use in source and binary forms, with or */
17 /* without modification, are permitted provided that the following */
18 /* conditions are met: */
19 /* */
20 /* - Redistributions of source code must retain the above copyright */
21 /* notice, this list of conditions and the following disclaimer. */
22 /* - Redistributions in binary form must reproduce the above */
23 /* copyright notice, this list of conditions and the following */
24 /* disclaimer in the documentation and/or other materials provided */
25 /* with the distribution. */
26 /* - Neither the name of the HTS working group nor the names of its */
27 /* contributors may be used to endorse or promote products derived */
28 /* from this software without specific prior written permission. */
29 /* */
30 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
31 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
32 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
33 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
34 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
35 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
36 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
37 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
38 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
39 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
40 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
41 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
42 /* POSSIBILITY OF SUCH DAMAGE. */
43 /* ----------------------------------------------------------------- */
44
45 #ifndef HTS_ENGINE_C
46 #define HTS_ENGINE_C
47
48 #ifdef __cplusplus
49 #define HTS_ENGINE_C_START extern "C" {
50 #define HTS_ENGINE_C_END }
51 #else
52 #define HTS_ENGINE_C_START
53 #define HTS_ENGINE_C_END
54 #endif /* __CPLUSPLUS */
55
56 HTS_ENGINE_C_START;
57
58 #include <stdlib.h> /* for atof() */
59 #include <string.h> /* for strcpy() */
60 #include <math.h> /* for pow() */
61
62 /* hts_engine libraries */
63 #include "HTS_hidden.h"
64
65 /* HTS_Engine_initialize: initialize engine */
HTS_Engine_initialize(HTS_Engine * engine)66 void HTS_Engine_initialize(HTS_Engine * engine)
67 {
68 /* global */
69 engine->condition.sampling_frequency = 0;
70 engine->condition.fperiod = 0;
71 engine->condition.audio_buff_size = 0;
72 engine->condition.stop = FALSE;
73 engine->condition.volume = 1.0;
74 engine->condition.msd_threshold = NULL;
75 engine->condition.gv_weight = NULL;
76
77 /* duration */
78 engine->condition.speed = 1.0;
79 engine->condition.phoneme_alignment_flag = FALSE;
80
81 /* spectrum */
82 engine->condition.stage = 0;
83 engine->condition.use_log_gain = FALSE;
84 engine->condition.alpha = 0.0;
85 engine->condition.beta = 0.0;
86
87 /* log F0 */
88 engine->condition.additional_half_tone = 0.0;
89
90 /* interpolation weights */
91 engine->condition.duration_iw = NULL;
92 engine->condition.parameter_iw = NULL;
93 engine->condition.gv_iw = NULL;
94
95 /* initialize audio */
96 HTS_Audio_initialize(&engine->audio);
97 /* initialize model set */
98 HTS_ModelSet_initialize(&engine->ms);
99 /* initialize label list */
100 HTS_Label_initialize(&engine->label);
101 /* initialize state sequence set */
102 HTS_SStreamSet_initialize(&engine->sss);
103 /* initialize pstream set */
104 HTS_PStreamSet_initialize(&engine->pss);
105 /* initialize gstream set */
106 HTS_GStreamSet_initialize(&engine->gss);
107 }
108
109 /* HTS_Engine_load: load HTS voices */
HTS_Engine_load(HTS_Engine * engine,char ** voices,size_t num_voices)110 HTS_Boolean HTS_Engine_load(HTS_Engine * engine, char **voices, size_t num_voices)
111 {
112 size_t i, j;
113 size_t nstream;
114 double average_weight;
115 const char *option, *find;
116
117 /* reset engine */
118 HTS_Engine_clear(engine);
119
120 /* load voices */
121 if (HTS_ModelSet_load(&engine->ms, voices, num_voices) != TRUE) {
122 HTS_Engine_clear(engine);
123 return FALSE;
124 }
125 nstream = HTS_ModelSet_get_nstream(&engine->ms);
126 average_weight = 1.0 / num_voices;
127
128 /* global */
129 engine->condition.sampling_frequency = HTS_ModelSet_get_sampling_frequency(&engine->ms);
130 engine->condition.fperiod = HTS_ModelSet_get_fperiod(&engine->ms);
131 engine->condition.msd_threshold = (double *) HTS_calloc(nstream, sizeof(double));
132 for (i = 0; i < nstream; i++)
133 engine->condition.msd_threshold[i] = 0.5;
134 engine->condition.gv_weight = (double *) HTS_calloc(nstream, sizeof(double));
135 for (i = 0; i < nstream; i++)
136 engine->condition.gv_weight[i] = 1.0;
137
138 /* spectrum */
139 option = HTS_ModelSet_get_option(&engine->ms, 0);
140 find = strstr(option, "GAMMA=");
141 if (find != NULL)
142 engine->condition.stage = (size_t) atoi(&find[strlen("GAMMA=")]);
143 find = strstr(option, "LN_GAIN=");
144 if (find != NULL)
145 engine->condition.use_log_gain = atoi(&find[strlen("LN_GAIN=")]) == 1 ? TRUE : FALSE;
146 find = strstr(option, "ALPHA=");
147 if (find != NULL)
148 engine->condition.alpha = atof(&find[strlen("ALPHA=")]);
149
150 /* interpolation weights */
151 engine->condition.duration_iw = (double *) HTS_calloc(num_voices, sizeof(double));
152 for (i = 0; i < num_voices; i++)
153 engine->condition.duration_iw[i] = average_weight;
154 engine->condition.parameter_iw = (double **) HTS_calloc(num_voices, sizeof(double *));
155 for (i = 0; i < num_voices; i++) {
156 engine->condition.parameter_iw[i] = (double *) HTS_calloc(nstream, sizeof(double));
157 for (j = 0; j < nstream; j++)
158 engine->condition.parameter_iw[i][j] = average_weight;
159 }
160 engine->condition.gv_iw = (double **) HTS_calloc(num_voices, sizeof(double *));
161 for (i = 0; i < num_voices; i++) {
162 engine->condition.gv_iw[i] = (double *) HTS_calloc(nstream, sizeof(double));
163 for (j = 0; j < nstream; j++)
164 engine->condition.gv_iw[i][j] = average_weight;
165 }
166
167 return TRUE;
168 }
169
170 /* HTS_Engine_set_sampling_frequency: set sampling frequency */
HTS_Engine_set_sampling_frequency(HTS_Engine * engine,size_t i)171 void HTS_Engine_set_sampling_frequency(HTS_Engine * engine, size_t i)
172 {
173 if (i < 1)
174 i = 1;
175 engine->condition.sampling_frequency = i;
176 HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size);
177 }
178
179 /* HTS_Engine_get_sampling_frequency: get sampling frequency */
HTS_Engine_get_sampling_frequency(HTS_Engine * engine)180 size_t HTS_Engine_get_sampling_frequency(HTS_Engine * engine)
181 {
182 return engine->condition.sampling_frequency;
183 }
184
185 /* HTS_Engine_set_fperiod: set frame period */
HTS_Engine_set_fperiod(HTS_Engine * engine,size_t i)186 void HTS_Engine_set_fperiod(HTS_Engine * engine, size_t i)
187 {
188 if (i < 1)
189 i = 1;
190 engine->condition.fperiod = i;
191 }
192
193 /* HTS_Engine_get_fperiod: get frame period */
HTS_Engine_get_fperiod(HTS_Engine * engine)194 size_t HTS_Engine_get_fperiod(HTS_Engine * engine)
195 {
196 return engine->condition.fperiod;
197 }
198
199 /* HTS_Engine_set_audio_buff_size: set audio buffer size */
HTS_Engine_set_audio_buff_size(HTS_Engine * engine,size_t i)200 void HTS_Engine_set_audio_buff_size(HTS_Engine * engine, size_t i)
201 {
202 engine->condition.audio_buff_size = i;
203 HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size);
204 }
205
206 /* HTS_Engine_get_audio_buff_size: get audio buffer size */
HTS_Engine_get_audio_buff_size(HTS_Engine * engine)207 size_t HTS_Engine_get_audio_buff_size(HTS_Engine * engine)
208 {
209 return engine->condition.audio_buff_size;
210 }
211
212 /* HTS_Engine_set_stop_flag: set stop flag */
HTS_Engine_set_stop_flag(HTS_Engine * engine,HTS_Boolean b)213 void HTS_Engine_set_stop_flag(HTS_Engine * engine, HTS_Boolean b)
214 {
215 engine->condition.stop = b;
216 }
217
218 /* HTS_Engine_get_stop_flag: get stop flag */
HTS_Engine_get_stop_flag(HTS_Engine * engine)219 HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine * engine)
220 {
221 return engine->condition.stop;
222 }
223
224 /* HTS_Engine_set_volume: set volume in db */
HTS_Engine_set_volume(HTS_Engine * engine,double f)225 void HTS_Engine_set_volume(HTS_Engine * engine, double f)
226 {
227 engine->condition.volume = exp(f * DB);
228 }
229
230 /* HTS_Engine_get_volume: get volume in db */
HTS_Engine_get_volume(HTS_Engine * engine)231 double HTS_Engine_get_volume(HTS_Engine * engine)
232 {
233 return log(engine->condition.volume) / DB;
234 }
235
236 /* HTS_Egnine_set_msd_threshold: set MSD threshold */
HTS_Engine_set_msd_threshold(HTS_Engine * engine,size_t stream_index,double f)237 void HTS_Engine_set_msd_threshold(HTS_Engine * engine, size_t stream_index, double f)
238 {
239 if (f < 0.0)
240 f = 0.0;
241 if (f > 1.0)
242 f = 1.0;
243 engine->condition.msd_threshold[stream_index] = f;
244 }
245
246 /* HTS_Engine_get_msd_threshold: get MSD threshold */
HTS_Engine_get_msd_threshold(HTS_Engine * engine,size_t stream_index)247 double HTS_Engine_get_msd_threshold(HTS_Engine * engine, size_t stream_index)
248 {
249 return engine->condition.msd_threshold[stream_index];
250 }
251
252 /* HTS_Engine_set_gv_weight: set GV weight */
HTS_Engine_set_gv_weight(HTS_Engine * engine,size_t stream_index,double f)253 void HTS_Engine_set_gv_weight(HTS_Engine * engine, size_t stream_index, double f)
254 {
255 if (f < 0.0)
256 f = 0.0;
257 engine->condition.gv_weight[stream_index] = f;
258 }
259
260 /* HTS_Engine_get_gv_weight: get GV weight */
HTS_Engine_get_gv_weight(HTS_Engine * engine,size_t stream_index)261 double HTS_Engine_get_gv_weight(HTS_Engine * engine, size_t stream_index)
262 {
263 return engine->condition.gv_weight[stream_index];
264 }
265
266 /* HTS_Engine_set_speed: set speech speed */
HTS_Engine_set_speed(HTS_Engine * engine,double f)267 void HTS_Engine_set_speed(HTS_Engine * engine, double f)
268 {
269 if (f < 1.0E-06)
270 f = 1.0E-06;
271 engine->condition.speed = f;
272 }
273
274 /* HTS_Engine_set_phoneme_alignment_flag: set flag for using phoneme alignment in label */
HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine,HTS_Boolean b)275 void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine, HTS_Boolean b)
276 {
277 engine->condition.phoneme_alignment_flag = b;
278 }
279
280 /* HTS_Engine_set_alpha: set alpha */
HTS_Engine_set_alpha(HTS_Engine * engine,double f)281 void HTS_Engine_set_alpha(HTS_Engine * engine, double f)
282 {
283 if (f < 0.0)
284 f = 0.0;
285 if (f > 1.0)
286 f = 1.0;
287 engine->condition.alpha = f;
288 }
289
290 /* HTS_Engine_get_alpha: get alpha */
HTS_Engine_get_alpha(HTS_Engine * engine)291 double HTS_Engine_get_alpha(HTS_Engine * engine)
292 {
293 return engine->condition.alpha;
294 }
295
296 /* HTS_Engine_set_beta: set beta */
HTS_Engine_set_beta(HTS_Engine * engine,double f)297 void HTS_Engine_set_beta(HTS_Engine * engine, double f)
298 {
299 if (f < 0.0)
300 f = 0.0;
301 if (f > 1.0)
302 f = 1.0;
303 engine->condition.beta = f;
304 }
305
306 /* HTS_Engine_get_beta: get beta */
HTS_Engine_get_beta(HTS_Engine * engine)307 double HTS_Engine_get_beta(HTS_Engine * engine)
308 {
309 return engine->condition.beta;
310 }
311
312 /* HTS_Engine_add_half_tone: add half tone */
HTS_Engine_add_half_tone(HTS_Engine * engine,double f)313 void HTS_Engine_add_half_tone(HTS_Engine * engine, double f)
314 {
315 engine->condition.additional_half_tone = f;
316 }
317
318 /* HTS_Engine_set_duration_interpolation_weight: set interpolation weight for duration */
HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine,size_t voice_index,double f)319 void HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index, double f)
320 {
321 engine->condition.duration_iw[voice_index] = f;
322 }
323
324 /* HTS_Engine_get_duration_interpolation_weight: get interpolation weight for duration */
HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine,size_t voice_index)325 double HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index)
326 {
327 return engine->condition.duration_iw[voice_index];
328 }
329
330 /* HTS_Engine_set_parameter_interpolation_weight: set interpolation weight for parameter */
HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index,double f)331 void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
332 {
333 engine->condition.parameter_iw[voice_index][stream_index] = f;
334 }
335
336 /* HTS_Engine_get_parameter_interpolation_weight: get interpolation weight for parameter */
HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index)337 double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
338 {
339 return engine->condition.parameter_iw[voice_index][stream_index];
340 }
341
342 /* HTS_Engine_set_gv_interpolation_weight: set interpolation weight for GV */
HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index,double f)343 void HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
344 {
345 engine->condition.gv_iw[voice_index][stream_index] = f;
346 }
347
348 /* HTS_Engine_get_gv_interpolation_weight: get interpolation weight for GV */
HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine,size_t voice_index,size_t stream_index)349 double HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
350 {
351 return engine->condition.gv_iw[voice_index][stream_index];
352 }
353
354 /* HTS_Engine_get_total_state: get total number of state */
HTS_Engine_get_total_state(HTS_Engine * engine)355 size_t HTS_Engine_get_total_state(HTS_Engine * engine)
356 {
357 return HTS_SStreamSet_get_total_state(&engine->sss);
358 }
359
360 /* HTS_Engine_set_state_mean: set mean value of state */
HTS_Engine_set_state_mean(HTS_Engine * engine,size_t stream_index,size_t state_index,size_t vector_index,double f)361 void HTS_Engine_set_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index, double f)
362 {
363 HTS_SStreamSet_set_mean(&engine->sss, stream_index, state_index, vector_index, f);
364 }
365
366 /* HTS_Engine_get_state_mean: get mean value of state */
HTS_Engine_get_state_mean(HTS_Engine * engine,size_t stream_index,size_t state_index,size_t vector_index)367 double HTS_Engine_get_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index)
368 {
369 return HTS_SStreamSet_get_mean(&engine->sss, stream_index, state_index, vector_index);
370 }
371
372 /* HTS_Engine_get_state_duration: get state duration */
HTS_Engine_get_state_duration(HTS_Engine * engine,size_t state_index)373 size_t HTS_Engine_get_state_duration(HTS_Engine * engine, size_t state_index)
374 {
375 return HTS_SStreamSet_get_duration(&engine->sss, state_index);
376 }
377
378 /* HTS_Engine_get_nvoices: get number of voices */
HTS_Engine_get_nvoices(HTS_Engine * engine)379 size_t HTS_Engine_get_nvoices(HTS_Engine * engine)
380 {
381 return HTS_ModelSet_get_nvoices(&engine->ms);
382 }
383
384 /* HTS_Engine_get_nstream: get number of stream */
HTS_Engine_get_nstream(HTS_Engine * engine)385 size_t HTS_Engine_get_nstream(HTS_Engine * engine)
386 {
387 return HTS_ModelSet_get_nstream(&engine->ms);
388 }
389
390 /* HTS_Engine_get_nstate: get number of state */
HTS_Engine_get_nstate(HTS_Engine * engine)391 size_t HTS_Engine_get_nstate(HTS_Engine * engine)
392 {
393 return HTS_ModelSet_get_nstate(&engine->ms);
394 }
395
396 /* HTS_Engine_get_fullcontext_label_format: get full context label format */
HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine)397 const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine)
398 {
399 return HTS_ModelSet_get_fullcontext_label_format(&engine->ms);
400 }
401
402 /* HTS_Engine_get_fullcontext_label_version: get full context label version */
HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine)403 const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine)
404 {
405 return HTS_ModelSet_get_fullcontext_label_version(&engine->ms);
406 }
407
408 /* HTS_Engine_get_total_frame: get total number of frame */
HTS_Engine_get_total_frame(HTS_Engine * engine)409 size_t HTS_Engine_get_total_frame(HTS_Engine * engine)
410 {
411 return HTS_GStreamSet_get_total_frame(&engine->gss);
412 }
413
414 /* HTS_Engine_get_nsamples: get number of samples */
HTS_Engine_get_nsamples(HTS_Engine * engine)415 size_t HTS_Engine_get_nsamples(HTS_Engine * engine)
416 {
417 return HTS_GStreamSet_get_total_nsamples(&engine->gss);
418 }
419
420 /* HTS_Engine_get_generated_parameter: output generated parameter */
HTS_Engine_get_generated_parameter(HTS_Engine * engine,size_t stream_index,size_t frame_index,size_t vector_index)421 double HTS_Engine_get_generated_parameter(HTS_Engine * engine, size_t stream_index, size_t frame_index, size_t vector_index)
422 {
423 return HTS_GStreamSet_get_parameter(&engine->gss, stream_index, frame_index, vector_index);
424 }
425
426 /* HTS_Engine_get_generated_speech: output generated speech */
HTS_Engine_get_generated_speech(HTS_Engine * engine,size_t index)427 double HTS_Engine_get_generated_speech(HTS_Engine * engine, size_t index)
428 {
429 return HTS_GStreamSet_get_speech(&engine->gss, index);
430 }
431
432 /* HTS_Engine_generate_state_sequence: genereate state sequence (1st synthesis step) */
HTS_Engine_generate_state_sequence(HTS_Engine * engine)433 static HTS_Boolean HTS_Engine_generate_state_sequence(HTS_Engine * engine)
434 {
435 size_t i, state_index, model_index;
436 double f;
437
438 if (HTS_SStreamSet_create(&engine->sss, &engine->ms, &engine->label, engine->condition.phoneme_alignment_flag, engine->condition.speed, engine->condition.duration_iw, engine->condition.parameter_iw, engine->condition.gv_iw) != TRUE) {
439 HTS_Engine_refresh(engine);
440 return FALSE;
441 }
442 if (engine->condition.additional_half_tone != 0.0) {
443 state_index = 0;
444 model_index = 0;
445 for (i = 0; i < HTS_Engine_get_total_state(engine); i++) {
446 f = HTS_Engine_get_state_mean(engine, 1, i, 0);
447 f += engine->condition.additional_half_tone * HALF_TONE;
448 if (f < MIN_LF0)
449 f = MIN_LF0;
450 else if (f > MAX_LF0)
451 f = MAX_LF0;
452 HTS_Engine_set_state_mean(engine, 1, i, 0, f);
453 state_index++;
454 if (state_index >= HTS_Engine_get_nstate(engine)) {
455 state_index = 0;
456 model_index++;
457 }
458 }
459 }
460 return TRUE;
461 }
462
463 /* HTS_Engine_generate_state_sequence_from_fn: genereate state sequence from file name (1st synthesis step) */
HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine,const char * fn)464 HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine, const char *fn)
465 {
466 HTS_Engine_refresh(engine);
467 HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
468 return HTS_Engine_generate_state_sequence(engine);
469 }
470
471 /* HTS_Engine_generate_state_sequence_from_strings: generate state sequence from strings (1st synthesis step) */
HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine,char ** lines,size_t num_lines)472 HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
473 {
474 HTS_Engine_refresh(engine);
475 HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
476 return HTS_Engine_generate_state_sequence(engine);
477 }
478
479 /* HTS_Engine_generate_parameter_sequence: generate parameter sequence (2nd synthesis step) */
HTS_Engine_generate_parameter_sequence(HTS_Engine * engine)480 HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine * engine)
481 {
482 return HTS_PStreamSet_create(&engine->pss, &engine->sss, engine->condition.msd_threshold, engine->condition.gv_weight);
483 }
484
485 /* HTS_Engine_generate_sample_sequence: generate sample sequence (3rd synthesis step) */
HTS_Engine_generate_sample_sequence(HTS_Engine * engine)486 HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine * engine)
487 {
488 return HTS_GStreamSet_create(&engine->gss, &engine->pss, engine->condition.stage, engine->condition.use_log_gain, engine->condition.sampling_frequency, engine->condition.fperiod, engine->condition.alpha, engine->condition.beta, &engine->condition.stop, engine->condition.volume, engine->condition.audio_buff_size > 0 ? &engine->audio : NULL);
489 }
490
491 /* HTS_Engine_synthesize: synthesize speech */
HTS_Engine_synthesize(HTS_Engine * engine)492 static HTS_Boolean HTS_Engine_synthesize(HTS_Engine * engine)
493 {
494 if (HTS_Engine_generate_state_sequence(engine) != TRUE) {
495 HTS_Engine_refresh(engine);
496 return FALSE;
497 }
498 if (HTS_Engine_generate_parameter_sequence(engine) != TRUE) {
499 HTS_Engine_refresh(engine);
500 return FALSE;
501 }
502 if (HTS_Engine_generate_sample_sequence(engine) != TRUE) {
503 HTS_Engine_refresh(engine);
504 return FALSE;
505 }
506 return TRUE;
507 }
508
509 /* HTS_Engine_synthesize_from_fn: synthesize speech from file name */
HTS_Engine_synthesize_from_fn(HTS_Engine * engine,const char * fn)510 HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn)
511 {
512 HTS_Engine_refresh(engine);
513 HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
514 return HTS_Engine_synthesize(engine);
515 }
516
517 /* HTS_Engine_synthesize_from_strings: synthesize speech from strings */
HTS_Engine_synthesize_from_strings(HTS_Engine * engine,char ** lines,size_t num_lines)518 HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
519 {
520 HTS_Engine_refresh(engine);
521 HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
522 return HTS_Engine_synthesize(engine);
523 }
524
525 /* HTS_Engine_save_information: save trace information */
HTS_Engine_save_information(HTS_Engine * engine,FILE * fp)526 void HTS_Engine_save_information(HTS_Engine * engine, FILE * fp)
527 {
528 size_t i, j, k, l, m, n;
529 double temp;
530 HTS_Condition *condition = &engine->condition;
531 HTS_ModelSet *ms = &engine->ms;
532 HTS_Label *label = &engine->label;
533 HTS_SStreamSet *sss = &engine->sss;
534 HTS_PStreamSet *pss = &engine->pss;
535
536 /* global parameter */
537 fprintf(fp, "[Global parameter]\n");
538 fprintf(fp, "Sampring frequency -> %8lu(Hz)\n", (unsigned long) condition->sampling_frequency);
539 fprintf(fp, "Frame period -> %8lu(point)\n", (unsigned long) condition->fperiod);
540 fprintf(fp, " %8.5f(msec)\n", 1e+3 * condition->fperiod / condition->sampling_frequency);
541 fprintf(fp, "All-pass constant -> %8.5f\n", (float) condition->alpha);
542 fprintf(fp, "Gamma -> %8.5f\n", (float) (condition->stage == 0 ? 0.0 : -1.0 / condition->stage));
543 if (condition->stage != 0) {
544 if (condition->use_log_gain == TRUE)
545 fprintf(fp, "Log gain flag -> TRUE\n");
546 else
547 fprintf(fp, "Log gain flag -> FALSE\n");
548 }
549 fprintf(fp, "Postfiltering coefficient -> %8.5f\n", (float) condition->beta);
550 fprintf(fp, "Audio buffer size -> %8lu(sample)\n", (unsigned long) condition->audio_buff_size);
551 fprintf(fp, "\n");
552
553 /* duration parameter */
554 fprintf(fp, "[Duration parameter]\n");
555 fprintf(fp, "Number of states -> %8lu\n", (unsigned long) HTS_ModelSet_get_nstate(ms));
556 fprintf(fp, " Interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
557 /* check interpolation */
558 for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++)
559 temp += condition->duration_iw[i];
560 for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
561 if (condition->duration_iw[i] != 0.0)
562 condition->duration_iw[i] /= temp;
563 for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
564 fprintf(fp, " Interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) i, (float) (100 * condition->duration_iw[i]));
565 fprintf(fp, "\n");
566
567 fprintf(fp, "[Stream parameter]\n");
568 for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) {
569 /* stream parameter */
570 fprintf(fp, "Stream[%2lu] vector length -> %8lu\n", (unsigned long) i, (unsigned long) HTS_ModelSet_get_vector_length(ms, i));
571 fprintf(fp, " Dynamic window size -> %8lu\n", (unsigned long) HTS_ModelSet_get_window_size(ms, i));
572 /* interpolation */
573 fprintf(fp, " Interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
574 for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
575 temp += condition->parameter_iw[j][i];
576 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
577 if (condition->parameter_iw[j][i] != 0.0)
578 condition->parameter_iw[j][i] /= temp;
579 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
580 fprintf(fp, " Interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->parameter_iw[j][i]));
581 /* MSD */
582 if (HTS_ModelSet_is_msd(ms, i)) { /* for MSD */
583 fprintf(fp, " MSD flag -> TRUE\n");
584 fprintf(fp, " MSD threshold -> %8.5f\n", condition->msd_threshold[i]);
585 } else { /* for non MSD */
586 fprintf(fp, " MSD flag -> FALSE\n");
587 }
588 /* GV */
589 if (HTS_ModelSet_use_gv(ms, i)) {
590 fprintf(fp, " GV flag -> TRUE\n");
591 fprintf(fp, " GV weight -> %8.0f(%%)\n", (float) (100 * condition->gv_weight[i]));
592 fprintf(fp, " GV interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
593 /* interpolation */
594 for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
595 temp += condition->gv_iw[j][i];
596 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
597 if (condition->gv_iw[j][i] != 0.0)
598 condition->gv_iw[j][i] /= temp;
599 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
600 fprintf(fp, " GV interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->gv_iw[j][i]));
601 } else {
602 fprintf(fp, " GV flag -> FALSE\n");
603 }
604 }
605 fprintf(fp, "\n");
606
607 /* generated sequence */
608 fprintf(fp, "[Generated sequence]\n");
609 fprintf(fp, "Number of HMMs -> %8lu\n", (unsigned long) HTS_Label_get_size(label));
610 fprintf(fp, "Number of stats -> %8lu\n", (unsigned long) HTS_Label_get_size(label) * HTS_ModelSet_get_nstate(ms));
611 fprintf(fp, "Length of this speech -> %8.3f(sec)\n", (float) ((double) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod / condition->sampling_frequency));
612 fprintf(fp, " -> %8lu(frames)\n", (unsigned long) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod);
613
614 for (i = 0; i < HTS_Label_get_size(label); i++) {
615 fprintf(fp, "HMM[%2lu]\n", (unsigned long) i);
616 fprintf(fp, " Name -> %s\n", HTS_Label_get_string(label, i));
617 fprintf(fp, " Duration\n");
618 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) {
619 fprintf(fp, " Interpolation[%2lu]\n", (unsigned long) j);
620 HTS_ModelSet_get_duration_index(ms, j, HTS_Label_get_string(label, i), &k, &l);
621 fprintf(fp, " Tree index -> %8lu\n", (unsigned long) k);
622 fprintf(fp, " PDF index -> %8lu\n", (unsigned long) l);
623 }
624 for (j = 0; j < HTS_ModelSet_get_nstate(ms); j++) {
625 fprintf(fp, " State[%2lu]\n", (unsigned long) j + 2);
626 fprintf(fp, " Length -> %8lu(frames)\n", (unsigned long) HTS_SStreamSet_get_duration(sss, i * HTS_ModelSet_get_nstate(ms) + j));
627 for (k = 0; k < HTS_ModelSet_get_nstream(ms); k++) {
628 fprintf(fp, " Stream[%2lu]\n", (unsigned long) k);
629 if (HTS_ModelSet_is_msd(ms, k)) {
630 if (HTS_SStreamSet_get_msd(sss, k, i * HTS_ModelSet_get_nstate(ms) + j) > condition->msd_threshold[k])
631 fprintf(fp, " MSD flag -> TRUE\n");
632 else
633 fprintf(fp, " MSD flag -> FALSE\n");
634 }
635 for (l = 0; l < HTS_ModelSet_get_nvoices(ms); l++) {
636 fprintf(fp, " Interpolation[%2lu]\n", (unsigned long) l);
637 HTS_ModelSet_get_parameter_index(ms, l, k, j + 2, HTS_Label_get_string(label, i), &m, &n);
638 fprintf(fp, " Tree index -> %8lu\n", (unsigned long) m);
639 fprintf(fp, " PDF index -> %8lu\n", (unsigned long) n);
640 }
641 }
642 }
643 }
644 }
645
646 /* HTS_Engine_save_label: save label with time */
HTS_Engine_save_label(HTS_Engine * engine,FILE * fp)647 void HTS_Engine_save_label(HTS_Engine * engine, FILE * fp)
648 {
649 size_t i, j;
650 size_t frame, state, duration;
651
652 HTS_Label *label = &engine->label;
653 HTS_SStreamSet *sss = &engine->sss;
654 size_t nstate = HTS_ModelSet_get_nstate(&engine->ms);
655 double rate = engine->condition.fperiod * 1.0e+07 / engine->condition.sampling_frequency;
656
657 for (i = 0, state = 0, frame = 0; i < HTS_Label_get_size(label); i++) {
658 for (j = 0, duration = 0; j < nstate; j++)
659 duration += HTS_SStreamSet_get_duration(sss, state++);
660 fprintf(fp, "%lu %lu %s\n", (unsigned long) (frame * rate), (unsigned long) ((frame + duration) * rate), HTS_Label_get_string(label, i));
661 frame += duration;
662 }
663 }
664
665 /* HTS_Engine_save_generated_parameter: save generated parameter */
HTS_Engine_save_generated_parameter(HTS_Engine * engine,size_t stream_index,FILE * fp)666 void HTS_Engine_save_generated_parameter(HTS_Engine * engine, size_t stream_index, FILE * fp)
667 {
668 size_t i, j;
669 float temp;
670 HTS_GStreamSet *gss = &engine->gss;
671
672 for (i = 0; i < HTS_GStreamSet_get_total_frame(gss); i++)
673 for (j = 0; j < HTS_GStreamSet_get_vector_length(gss, stream_index); j++) {
674 temp = (float) HTS_GStreamSet_get_parameter(gss, stream_index, i, j);
675 fwrite(&temp, sizeof(float), 1, fp);
676 }
677 }
678
679 /* HTS_Engine_save_generated_speech: save generated speech */
HTS_Engine_save_generated_speech(HTS_Engine * engine,FILE * fp)680 void HTS_Engine_save_generated_speech(HTS_Engine * engine, FILE * fp)
681 {
682 size_t i;
683 double x;
684 short temp;
685 HTS_GStreamSet *gss = &engine->gss;
686
687 for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) {
688 x = HTS_GStreamSet_get_speech(gss, i);
689 if (x > 32767.0)
690 temp = 32767;
691 else if (x < -32768.0)
692 temp = -32768;
693 else
694 temp = (short) x;
695 fwrite(&temp, sizeof(short), 1, fp);
696 }
697 }
698
699 /* HTS_Engine_save_riff: save RIFF format file */
HTS_Engine_save_riff(HTS_Engine * engine,FILE * fp)700 void HTS_Engine_save_riff(HTS_Engine * engine, FILE * fp)
701 {
702 size_t i;
703 double x;
704 short temp;
705
706 HTS_GStreamSet *gss = &engine->gss;
707 char data_01_04[] = { 'R', 'I', 'F', 'F' };
708 int data_05_08 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short) + 36;
709 char data_09_12[] = { 'W', 'A', 'V', 'E' };
710 char data_13_16[] = { 'f', 'm', 't', ' ' };
711 int data_17_20 = 16;
712 short data_21_22 = 1; /* PCM */
713 short data_23_24 = 1; /* monoral */
714 int data_25_28 = engine->condition.sampling_frequency;
715 int data_29_32 = engine->condition.sampling_frequency * sizeof(short);
716 short data_33_34 = sizeof(short);
717 short data_35_36 = (short) (sizeof(short) * 8);
718 char data_37_40[] = { 'd', 'a', 't', 'a' };
719 int data_41_44 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short);
720
721 /* write header */
722 HTS_fwrite_little_endian(data_01_04, sizeof(char), 4, fp);
723 HTS_fwrite_little_endian(&data_05_08, sizeof(int), 1, fp);
724 HTS_fwrite_little_endian(data_09_12, sizeof(char), 4, fp);
725 HTS_fwrite_little_endian(data_13_16, sizeof(char), 4, fp);
726 HTS_fwrite_little_endian(&data_17_20, sizeof(int), 1, fp);
727 HTS_fwrite_little_endian(&data_21_22, sizeof(short), 1, fp);
728 HTS_fwrite_little_endian(&data_23_24, sizeof(short), 1, fp);
729 HTS_fwrite_little_endian(&data_25_28, sizeof(int), 1, fp);
730 HTS_fwrite_little_endian(&data_29_32, sizeof(int), 1, fp);
731 HTS_fwrite_little_endian(&data_33_34, sizeof(short), 1, fp);
732 HTS_fwrite_little_endian(&data_35_36, sizeof(short), 1, fp);
733 HTS_fwrite_little_endian(data_37_40, sizeof(char), 4, fp);
734 HTS_fwrite_little_endian(&data_41_44, sizeof(int), 1, fp);
735 /* write data */
736 for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) {
737 x = HTS_GStreamSet_get_speech(gss, i);
738 if (x > 32767.0)
739 temp = 32767;
740 else if (x < -32768.0)
741 temp = -32768;
742 else
743 temp = (short) x;
744 HTS_fwrite_little_endian(&temp, sizeof(short), 1, fp);
745 }
746 }
747
748 /* HTS_Engine_refresh: free model per one time synthesis */
HTS_Engine_refresh(HTS_Engine * engine)749 void HTS_Engine_refresh(HTS_Engine * engine)
750 {
751 /* free generated parameter stream set */
752 HTS_GStreamSet_clear(&engine->gss);
753 /* free parameter stream set */
754 HTS_PStreamSet_clear(&engine->pss);
755 /* free state stream set */
756 HTS_SStreamSet_clear(&engine->sss);
757 /* free label list */
758 HTS_Label_clear(&engine->label);
759 /* stop flag */
760 engine->condition.stop = FALSE;
761 }
762
763 /* HTS_Engine_clear: free engine */
HTS_Engine_clear(HTS_Engine * engine)764 void HTS_Engine_clear(HTS_Engine * engine)
765 {
766 size_t i;
767
768 if (engine->condition.msd_threshold != NULL)
769 HTS_free(engine->condition.msd_threshold);
770 if (engine->condition.duration_iw != NULL)
771 HTS_free(engine->condition.duration_iw);
772 if (engine->condition.gv_weight != NULL)
773 HTS_free(engine->condition.gv_weight);
774 if (engine->condition.parameter_iw != NULL) {
775 for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++)
776 HTS_free(engine->condition.parameter_iw[i]);
777 HTS_free(engine->condition.parameter_iw);
778 }
779 if (engine->condition.gv_iw != NULL) {
780 for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++)
781 HTS_free(engine->condition.gv_iw[i]);
782 HTS_free(engine->condition.gv_iw);
783 }
784
785 HTS_ModelSet_clear(&engine->ms);
786 HTS_Audio_clear(&engine->audio);
787 HTS_Engine_initialize(engine);
788 }
789
790 HTS_ENGINE_C_END;
791
792 #endif /* !HTS_ENGINE_C */
793