1 /* ----------------------------------------------------------------- */
2 /*           The HMM-Based Speech Synthesis Engine "hts_engine API"  */
3 /*           developed by HTS Working Group                          */
4 /*           http://hts-engine.sourceforge.net/                      */
5 /* ----------------------------------------------------------------- */
6 /*                                                                   */
7 /*  Copyright (c) 2001-2015  Nagoya Institute of Technology          */
8 /*                           Department of Computer Science          */
9 /*                                                                   */
10 /*                2001-2008  Tokyo Institute of Technology           */
11 /*                           Interdisciplinary Graduate School of    */
12 /*                           Science and Engineering                 */
13 /*                                                                   */
14 /* All rights reserved.                                              */
15 /*                                                                   */
16 /* Redistribution and use in source and binary forms, with or        */
17 /* without modification, are permitted provided that the following   */
18 /* conditions are met:                                               */
19 /*                                                                   */
20 /* - Redistributions of source code must retain the above copyright  */
21 /*   notice, this list of conditions and the following disclaimer.   */
22 /* - Redistributions in binary form must reproduce the above         */
23 /*   copyright notice, this list of conditions and the following     */
24 /*   disclaimer in the documentation and/or other materials provided */
25 /*   with the distribution.                                          */
26 /* - Neither the name of the HTS working group nor the names of its  */
27 /*   contributors may be used to endorse or promote products derived */
28 /*   from this software without specific prior written permission.   */
29 /*                                                                   */
30 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND            */
31 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
32 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
33 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
34 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
35 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,          */
36 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED   */
37 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     */
38 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
39 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   */
40 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY    */
41 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
42 /* POSSIBILITY OF SUCH DAMAGE.                                       */
43 /* ----------------------------------------------------------------- */
44 
45 #ifndef HTS_ENGINE_H
46 #define HTS_ENGINE_H
47 
/* Standard headers are included before the linkage block is opened, so that
   they are never pulled in from inside an extern "C" region under C++. */
#include <stdio.h>

/* HTS_ENGINE_H_START / HTS_ENGINE_H_END bracket the declarations of this
   header: under C++ they expand to an extern "C" block, under C to nothing. */
#ifdef __cplusplus
#define HTS_ENGINE_H_START extern "C" {
#define HTS_ENGINE_H_END   }
#else
#define HTS_ENGINE_H_START
#define HTS_ENGINE_H_END
#endif                          /* __cplusplus */

HTS_ENGINE_H_START;
59 
60 /* common ---------------------------------------------------------- */
61 
/* HTS_Boolean: type used for the flags declared in this header */
typedef char HTS_Boolean;

/* TRUE / FALSE: provided here only if nothing has defined them already */
#if !defined(TRUE)
#define TRUE  1
#endif                          /* !defined(TRUE) */

#if !defined(FALSE)
#define FALSE 0
#endif                          /* !defined(FALSE) */

/* HTS_NODATA: large negative constant, overridable before inclusion.
   NOTE(review): presumably a "no data" sentinel, judging by the name only. */
#if !defined(HTS_NODATA)
#define HTS_NODATA (-1.0e+10)
#endif                          /* !defined(HTS_NODATA) */
75 
76 /* copyright ------------------------------------------------------- */
77 
/* HTS_COPYRIGHT: copyright banner, one source line per line of output */
#define HTS_COPYRIGHT \
   "The HMM-Based Speech Synthesis Engine \"hts_engine API\"\n" \
   "Version 1.10 (http://hts-engine.sourceforge.net/)\n" \
   "Copyright (C) 2001-2015 Nagoya Institute of Technology\n" \
   "              2001-2008 Tokyo Institute of Technology\n" \
   "All rights reserved.\n"
79 
80 /* audio ----------------------------------------------------------- */
81 
/* HTS_Audio: wrapper for audio output */
typedef struct _HTS_Audio HTS_Audio;
struct _HTS_Audio {
   size_t sampling_frequency;   /* sampling frequency */
   size_t max_buff_size;        /* size of the buffer for the audio output interface */
   short *buff;                 /* buffer currently in use */
   size_t buff_size;            /* size of the buffer currently in use */
   void *audio_interface;       /* audio interface selected at compile time */
};
90 
91 /* model ----------------------------------------------------------- */
92 
/* HTS_Window: window coefficients used to compute dynamic features */
typedef struct _HTS_Window HTS_Window;
struct _HTS_Window {
   size_t size;                 /* number of windows (static + deltas) */
   int *l_width;                /* left widths of the windows */
   int *r_width;                /* right widths of the windows */
   double **coefficient;        /* window coefficients */
   size_t max_width;            /* largest window width */
};
101 
/* HTS_Pattern: singly linked list of patterns, used by questions and trees */
typedef struct _HTS_Pattern HTS_Pattern;
struct _HTS_Pattern {
   char *string;                /* the pattern string */
   HTS_Pattern *next;           /* next pattern in the list */
};
107 
108 /* HTS_Question: list of questions in a tree. */
109 typedef struct _HTS_Question {
110    char *string;                /* name of this question */
111    HTS_Pattern *head;           /* pointer to the head of pattern list */
112    struct _HTS_Question *next;  /* pointer to the next question */
113 } HTS_Question;
114 
115 /* HTS_Node: list of tree nodes in a tree. */
116 typedef struct _HTS_Node {
117    int index;                   /* index of this node */
118    size_t pdf;                  /* index of PDF for this node (leaf node only) */
119    struct _HTS_Node *yes;       /* pointer to its child node (yes) */
120    struct _HTS_Node *no;        /* pointer to its child node (no) */
121    struct _HTS_Node *next;      /* pointer to the next node */
122    HTS_Question *quest;         /* question applied at this node */
123 } HTS_Node;
124 
125 /* HTS_Tree: list of decision trees in a model. */
126 typedef struct _HTS_Tree {
127    HTS_Pattern *head;           /* pointer to the head of pattern list for this tree */
128    struct _HTS_Tree *next;      /* pointer to next tree */
129    HTS_Node *root;              /* root node of this tree */
130    size_t state;                /* state index of this tree */
131 } HTS_Tree;
132 
133 /* HTS_Model: set of PDFs, decision trees and questions. */
134 typedef struct _HTS_Model {
135    size_t vector_length;        /* vector length (static features only) */
136    size_t num_windows;          /* # of windows for delta */
137    HTS_Boolean is_msd;          /* flag for MSD */
138    size_t ntree;                /* # of trees */
139    size_t *npdf;                /* # of PDFs at each tree */
140    float ***pdf;                /* PDFs */
141    HTS_Tree *tree;              /* pointer to the list of trees */
142    HTS_Question *question;      /* pointer to the list of questions */
143 } HTS_Model;
144 
145 /* HTS_ModelSet: set of duration models, HMMs and GV models. */
146 typedef struct _HTS_ModelSet {
147    char *hts_voice_version;     /* version of HTS voice format */
148    size_t sampling_frequency;   /* sampling frequency */
149    size_t frame_period;         /* frame period */
150    size_t num_voices;           /* # of HTS voices */
151    size_t num_states;           /* # of HMM states */
152    size_t num_streams;          /* # of streams */
153    char *stream_type;           /* stream type */
154    char *fullcontext_format;    /* fullcontext label format */
155    char *fullcontext_version;   /* version of fullcontext label */
156    HTS_Question *gv_off_context;        /* GV switch */
157    char **option;               /* options for each stream */
158    HTS_Model *duration;         /* duration PDFs and trees */
159    HTS_Window *window;          /* window coefficients for delta */
160    HTS_Model **stream;          /* parameter PDFs and trees */
161    HTS_Model **gv;              /* GV PDFs and trees */
162 } HTS_ModelSet;
163 
164 /* label ----------------------------------------------------------- */
165 
/* HTS_LabelString: one label string plus its time information */
typedef struct _HTS_LabelString HTS_LabelString;
struct _HTS_LabelString {
   HTS_LabelString *next;       /* next label string in the list */
   char *name;                  /* the label string */
   double start;                /* start frame given in the label */
   double end;                  /* end frame given in the label */
};
173 
174 /* HTS_Label: list of label strings */
175 typedef struct _HTS_Label {
176    HTS_LabelString *head;       /* pointer to the head of label string */
177    size_t size;                 /* # of label strings */
178 } HTS_Label;
179 
180 /* sstream --------------------------------------------------------- */
181 
182 /* HTS_SStream: individual state stream */
183 typedef struct _HTS_SStream {
184    size_t vector_length;        /* vector length (static features only) */
185    double **mean;               /* mean vector sequence */
186    double **vari;               /* variance vector sequence */
187    double *msd;                 /* MSD parameter sequence */
188    size_t win_size;             /* # of windows (static + deltas) */
189    int *win_l_width;            /* left width of windows */
190    int *win_r_width;            /* right width of windows */
191    double **win_coefficient;    /* window cofficients */
192    size_t win_max_width;        /* maximum width of windows */
193    double *gv_mean;             /* mean vector of GV */
194    double *gv_vari;             /* variance vector of GV */
195    HTS_Boolean *gv_switch;      /* GV flag sequence */
196 } HTS_SStream;
197 
198 /* HTS_SStreamSet: set of state stream */
199 typedef struct _HTS_SStreamSet {
200    HTS_SStream *sstream;        /* state streams */
201    size_t nstream;              /* # of streams */
202    size_t nstate;               /* # of states */
203    size_t *duration;            /* duration sequence */
204    size_t total_state;          /* total state */
205    size_t total_frame;          /* total frame */
206 } HTS_SStreamSet;
207 
208 /* pstream --------------------------------------------------------- */
209 
/* HTS_SMatrices: matrices and vectors for the speech parameter generation algorithm */
typedef struct _HTS_SMatrices HTS_SMatrices;
struct _HTS_SMatrices {
   double **mean;               /* sequence of mean vectors */
   double **ivar;               /* sequence of inverse diagonal variances */
   double *g;                   /* vector for the forward substitution */
   double **wuw;                /* W' U^-1 W  */
   double *wum;                 /* W' U^-1 mu */
};
218 
219 /* HTS_PStream: individual PDF stream. */
220 typedef struct _HTS_PStream {
221    size_t vector_length;        /* vector length (static features only) */
222    size_t length;               /* stream length */
223    size_t width;                /* width of dynamic window */
224    double **par;                /* output parameter vector */
225    HTS_SMatrices sm;            /* matrices for parameter generation */
226    size_t win_size;             /* # of windows (static + deltas) */
227    int *win_l_width;            /* left width of windows */
228    int *win_r_width;            /* right width of windows */
229    double **win_coefficient;    /* window coefficients */
230    HTS_Boolean *msd_flag;       /* Boolean sequence for MSD */
231    double *gv_mean;             /* mean vector of GV */
232    double *gv_vari;             /* variance vector of GV */
233    HTS_Boolean *gv_switch;      /* GV flag sequence */
234    size_t gv_length;            /* frame length for GV calculation */
235 } HTS_PStream;
236 
237 /* HTS_PStreamSet: set of PDF streams. */
238 typedef struct _HTS_PStreamSet {
239    HTS_PStream *pstream;        /* PDF streams */
240    size_t nstream;              /* # of PDF streams */
241    size_t total_frame;          /* total frame */
242 } HTS_PStreamSet;
243 
244 /* gstream --------------------------------------------------------- */
245 
/* HTS_GStream: one stream of generated parameters */
typedef struct _HTS_GStream HTS_GStream;
struct _HTS_GStream {
   size_t vector_length;        /* length of the vector (static features only) */
   double **par;                /* the generated parameters */
};
251 
252 /* HTS_GStreamSet: set of generated parameter stream. */
253 typedef struct _HTS_GStreamSet {
254    size_t total_nsample;        /* total sample */
255    size_t total_frame;          /* total frame */
256    size_t nstream;              /* # of streams */
257    HTS_GStream *gstream;        /* generated parameter streams */
258    double *gspeech;             /* generated speech */
259 } HTS_GStreamSet;
260 
261 /* engine ---------------------------------------------------------- */
262 
263 /* HTS_Condition: synthesis condition */
264 typedef struct _HTS_Condition {
265    /* global */
266    size_t sampling_frequency;   /* sampling frequency */
267    size_t fperiod;              /* frame period */
268    size_t audio_buff_size;      /* audio buffer size (for audio device) */
269    HTS_Boolean stop;            /* stop flag */
270    double volume;               /* volume */
271    double *msd_threshold;       /* MSD thresholds */
272    double *gv_weight;           /* GV weights */
273 
274    /* duration */
275    HTS_Boolean phoneme_alignment_flag;  /* flag for using phoneme alignment in label */
276    double speed;                /* speech speed */
277 
278    /* spectrum */
279    size_t stage;                /* if stage=0 then gamma=0 else gamma=-1/stage */
280    HTS_Boolean use_log_gain;    /* log gain flag (for LSP) */
281    double alpha;                /* all-pass constant */
282    double beta;                 /* postfiltering coefficient */
283 
284    /* log F0 */
285    double additional_half_tone; /* additional half tone */
286 
287    /* interpolation weights */
288    double *duration_iw;         /* weights for duration interpolation */
289    double **parameter_iw;       /* weights for parameter interpolation */
290    double **gv_iw;              /* weights for GV interpolation */
291 } HTS_Condition;
292 
293 /* HTS_Engine: Engine itself. */
294 typedef struct _HTS_Engine {
295    HTS_Condition condition;     /* synthesis condition */
296    HTS_Audio audio;             /* audio output */
297    HTS_ModelSet ms;             /* set of duration models, HMMs and GV models */
298    HTS_Label label;             /* label */
299    HTS_SStreamSet sss;          /* set of state streams */
300    HTS_PStreamSet pss;          /* set of PDF streams */
301    HTS_GStreamSet gss;          /* set of generated parameter streams */
302 } HTS_Engine;
303 
304 /* engine method --------------------------------------------------- */
305 
306 /* HTS_Engine_initialize: initialize engine */
307 void HTS_Engine_initialize(HTS_Engine * engine);
308 
309 /* HTS_Engine_load: load HTS voices */
310 HTS_Boolean HTS_Engine_load(HTS_Engine * engine, char **voices, size_t num_voices);
311 
312 /* HTS_Engine_set_sampling_frequency: set sampling fraquency */
313 void HTS_Engine_set_sampling_frequency(HTS_Engine * engine, size_t i);
314 
315 /* HTS_Engine_get_sampling_frequency: get sampling frequency */
316 size_t HTS_Engine_get_sampling_frequency(HTS_Engine * engine);
317 
318 /* HTS_Engine_set_fperiod: set frame period */
319 void HTS_Engine_set_fperiod(HTS_Engine * engine, size_t i);
320 
321 /* HTS_Engine_get_fperiod: get frame period */
322 size_t HTS_Engine_get_fperiod(HTS_Engine * engine);
323 
324 /* HTS_Engine_set_audio_buff_size: set audio buffer size */
325 void HTS_Engine_set_audio_buff_size(HTS_Engine * engine, size_t i);
326 
327 /* HTS_Engine_get_audio_buff_size: get audio buffer size */
328 size_t HTS_Engine_get_audio_buff_size(HTS_Engine * engine);
329 
330 /* HTS_Engine_set_stop_flag: set stop flag */
331 void HTS_Engine_set_stop_flag(HTS_Engine * engine, HTS_Boolean b);
332 
333 /* HTS_Engine_get_stop_flag: get stop flag */
334 HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine * engine);
335 
336 /* HTS_Engine_set_volume: set volume in db */
337 void HTS_Engine_set_volume(HTS_Engine * engine, double f);
338 
339 /* HTS_Engine_get_volume: get volume in db */
340 double HTS_Engine_get_volume(HTS_Engine * engine);
341 
342 /* HTS_Egnine_set_msd_threshold: set MSD threshold */
343 void HTS_Engine_set_msd_threshold(HTS_Engine * engine, size_t stream_index, double f);
344 
345 /* HTS_Engine_get_msd_threshold: get MSD threshold */
346 double HTS_Engine_get_msd_threshold(HTS_Engine * engine, size_t stream_index);
347 
348 /* HTS_Engine_set_gv_weight: set GV weight */
349 void HTS_Engine_set_gv_weight(HTS_Engine * engine, size_t stream_index, double f);
350 
351 /* HTS_Engine_get_gv_weight: get GV weight */
352 double HTS_Engine_get_gv_weight(HTS_Engine * engine, size_t stream_index);
353 
354 /* HTS_Engine_set_speed: set speech speed */
355 void HTS_Engine_set_speed(HTS_Engine * engine, double f);
356 
357 /* HTS_Engine_set_phoneme_alignment_flag: set flag for using phoneme alignment in label */
358 void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine, HTS_Boolean b);
359 
360 /* HTS_Engine_set_alpha: set alpha */
361 void HTS_Engine_set_alpha(HTS_Engine * engine, double f);
362 
363 /* HTS_Engine_get_alpha: get alpha */
364 double HTS_Engine_get_alpha(HTS_Engine * engine);
365 
366 /* HTS_Engine_set_beta: set beta */
367 void HTS_Engine_set_beta(HTS_Engine * engine, double f);
368 
369 /* HTS_Engine_get_beta: get beta */
370 double HTS_Engine_get_beta(HTS_Engine * engine);
371 
372 /* HTS_Engine_add_half_tone: add half tone */
373 void HTS_Engine_add_half_tone(HTS_Engine * engine, double f);
374 
375 /* HTS_Engine_set_duration_interpolation_weight: set interpolation weight for duration */
376 void HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index, double f);
377 
378 /* HTS_Engine_get_duration_interpolation_weight: get interpolation weight for duration */
379 double HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index);
380 
381 /* HTS_Engine_set_parameter_interpolation_weight: set interpolation weight for parameter */
382 void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f);
383 
384 /* HTS_Engine_get_parameter_interpolation_weight: get interpolation weight for parameter */
385 double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index);
386 
387 /* HTS_Engine_set_gv_interpolation_weight: set interpolation weight for GV */
388 void HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f);
389 
390 /* HTS_Engine_get_gv_interpolation_weight: get interpolation weight for GV */
391 double HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index);
392 
393 /* HTS_Engine_get_total_state: get total number of state */
394 size_t HTS_Engine_get_total_state(HTS_Engine * engine);
395 
396 /* HTS_Engine_set_state_mean: set mean value of state */
397 void HTS_Engine_set_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index, double f);
398 
399 /* HTS_Engine_get_state_mean: get mean value of state */
400 double HTS_Engine_get_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index);
401 
402 /* HTS_Engine_get_state_duration: get state duration */
403 size_t HTS_Engine_get_state_duration(HTS_Engine * engine, size_t state_index);
404 
405 /* HTS_Engine_get_nvoices: get number of voices */
406 size_t HTS_Engine_get_nvoices(HTS_Engine * engine);
407 
408 /* HTS_Engine_get_nstream: get number of stream */
409 size_t HTS_Engine_get_nstream(HTS_Engine * engine);
410 
411 /* HTS_Engine_get_nstate: get number of state */
412 size_t HTS_Engine_get_nstate(HTS_Engine * engine);
413 
414 /* HTS_Engine_get_fullcontext_label_format: get full context label format */
415 const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine);
416 
417 /* HTS_Engine_get_fullcontext_label_version: get full context label version */
418 const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine);
419 
420 /* HTS_Engine_get_total_frame: get total number of frame */
421 size_t HTS_Engine_get_total_frame(HTS_Engine * engine);
422 
423 /* HTS_Engine_get_nsamples: get number of samples */
424 size_t HTS_Engine_get_nsamples(HTS_Engine * engine);
425 
426 /* HTS_Engine_get_generated_parameter: output generated parameter */
427 double HTS_Engine_get_generated_parameter(HTS_Engine * engine, size_t stream_index, size_t frame_index, size_t vector_index);
428 
429 /* HTS_Engine_get_generated_speech: output generated speech */
430 double HTS_Engine_get_generated_speech(HTS_Engine * engine, size_t index);
431 
432 /* HTS_Engine_synthesize_from_fn: synthesize speech from file name */
433 HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn);
434 
435 /* HTS_Engine_synthesize_from_strings: synthesize speech from string list */
436 HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines);
437 
438 /* HTS_Engine_generate_state_sequence_from_fn: generate state sequence from file name (1st synthesis step) */
439 HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine, const char *fn);
440 
441 /* HTS_Engine_generate_state_sequence_from_strings: generate state sequence from string list (1st synthesis step) */
442 HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine, char **lines, size_t num_lines);
443 
444 /* HTS_Engine_generate_parameter_sequence: generate parameter sequence (2nd synthesis step) */
445 HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine * engine);
446 
447 /* HTS_Engine_generate_sample_sequence: generate sample sequence (3rd synthesis step) */
448 HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine * engine);
449 
450 /* HTS_Engine_save_information: save trace information */
451 void HTS_Engine_save_information(HTS_Engine * engine, FILE * fp);
452 
453 /* HTS_Engine_save_label: save label with time */
454 void HTS_Engine_save_label(HTS_Engine * engine, FILE * fp);
455 
456 /* HTS_Engine_save_generated_parameter: save generated parameter */
457 void HTS_Engine_save_generated_parameter(HTS_Engine * engine, size_t stream_index, FILE * fp);
458 
459 /* HTS_Engine_save_generated_speech: save generated speech */
460 void HTS_Engine_save_generated_speech(HTS_Engine * engine, FILE * fp);
461 
462 /* HTS_Engine_save_riff: save RIFF format file */
463 void HTS_Engine_save_riff(HTS_Engine * engine, FILE * fp);
464 
465 /* HTS_Engine_refresh: free memory per one time synthesis */
466 void HTS_Engine_refresh(HTS_Engine * engine);
467 
468 /* HTS_Engine_clear: free engine */
469 void HTS_Engine_clear(HTS_Engine * engine);
470 
/* Closes the extern "C" block opened by HTS_ENGINE_H_START under C++;
   both macros expand to nothing when compiled as C. */
HTS_ENGINE_H_END;
472 
473 #endif                          /* !HTS_ENGINE_H */
474