/* ----------------------------------------------------------------- */
/*           The HMM-Based Speech Synthesis Engine "hts_engine API"  */
/*           developed by HTS Working Group                          */
/*           http://hts-engine.sourceforge.net/                      */
/* ----------------------------------------------------------------- */
/*                                                                   */
/*  Copyright (c) 2001-2015  Nagoya Institute of Technology          */
/*                           Department of Computer Science          */
/*                                                                   */
/*                2001-2008  Tokyo Institute of Technology           */
/*                           Interdisciplinary Graduate School of    */
/*                           Science and Engineering                 */
/*                                                                   */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/* - Redistributions of source code must retain the above copyright  */
/*   notice, this list of conditions and the following disclaimer.   */
/* - Redistributions in binary form must reproduce the above         */
/*   copyright notice, this list of conditions and the following     */
/*   disclaimer in the documentation and/or other materials provided */
/*   with the distribution.                                          */
/* - Neither the name of the HTS working group nor the names of its  */
/*   contributors may be used to endorse or promote products derived */
/*   from this software without specific prior written permission.   */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND            */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,          */
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED   */
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     */
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   */
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY    */
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/* ----------------------------------------------------------------- */

/* Public types and function declarations of the hts_engine API. */

#ifndef HTS_ENGINE_H
#define HTS_ENGINE_H

/* Standard headers are included before the extern "C" region is opened
   so that they are never wrapped by this header's linkage block. */
#include <stdio.h>              /* FILE, size_t */

#ifdef __cplusplus
#define HTS_ENGINE_H_START extern "C" {
#define HTS_ENGINE_H_END   }
#else
#define HTS_ENGINE_H_START
#define HTS_ENGINE_H_END
#endif                          /* __cplusplus */

/* No trailing semicolon: in C the macro expands to nothing, and a lone
   ';' at file scope is not a valid external declaration in ISO C. */
HTS_ENGINE_H_START

/* common ---------------------------------------------------------- */

typedef char HTS_Boolean;

#ifndef TRUE
#define TRUE  1
#endif                          /* !TRUE */

#ifndef FALSE
#define FALSE 0
#endif                          /* !FALSE */

#ifndef HTS_NODATA
#define HTS_NODATA (-1.0e+10)
#endif                          /* HTS_NODATA */

/* copyright ------------------------------------------------------- */

/* NOTE(review): the spacing inside this literal is reproduced exactly as
   found; alignment spaces may have been collapsed before this file was
   restored -- verify against the upstream release if the banner layout
   matters. */
#define HTS_COPYRIGHT "The HMM-Based Speech Synthesis Engine \"hts_engine API\"\nVersion 1.10 (http://hts-engine.sourceforge.net/)\nCopyright (C) 2001-2015 Nagoya Institute of Technology\n 2001-2008 Tokyo Institute of Technology\nAll rights reserved.\n"

/* audio ----------------------------------------------------------- */

/* HTS_Audio: audio output wrapper */
typedef struct _HTS_Audio {
   size_t sampling_frequency;   /* sampling frequency */
   size_t max_buff_size;        /* buffer size for audio output interface */
   short *buff;                 /* current buffer */
   size_t buff_size;            /* current buffer size */
   void *audio_interface;       /* audio interface specified in compile step */
} HTS_Audio;

/* model ----------------------------------------------------------- */

/* HTS_Window: window coefficients to calculate dynamic features. */
typedef struct _HTS_Window {
   size_t size;                 /* # of windows (static + deltas) */
   int *l_width;                /* left width of windows */
   int *r_width;                /* right width of windows */
   double **coefficient;        /* window coefficient */
   size_t max_width;            /* maximum width of windows */
} HTS_Window;

/* HTS_Pattern: list of patterns in a question and a tree. */
typedef struct _HTS_Pattern {
   char *string;                /* pattern string */
   struct _HTS_Pattern *next;   /* pointer to the next pattern */
} HTS_Pattern;

/* HTS_Question: list of questions in a tree. */
typedef struct _HTS_Question {
   char *string;                /* name of this question */
   HTS_Pattern *head;           /* pointer to the head of pattern list */
   struct _HTS_Question *next;  /* pointer to the next question */
} HTS_Question;

/* HTS_Node: list of tree nodes in a tree. */
typedef struct _HTS_Node {
   int index;                   /* index of this node */
   size_t pdf;                  /* index of PDF for this node (leaf node only) */
   struct _HTS_Node *yes;       /* pointer to its child node (yes) */
   struct _HTS_Node *no;        /* pointer to its child node (no) */
   struct _HTS_Node *next;      /* pointer to the next node */
   HTS_Question *quest;         /* question applied at this node */
} HTS_Node;

/* HTS_Tree: list of decision trees in a model. */
typedef struct _HTS_Tree {
   HTS_Pattern *head;           /* pointer to the head of pattern list for this tree */
   struct _HTS_Tree *next;      /* pointer to next tree */
   HTS_Node *root;              /* root node of this tree */
   size_t state;                /* state index of this tree */
} HTS_Tree;

/* HTS_Model: set of PDFs, decision trees and questions. */
typedef struct _HTS_Model {
   size_t vector_length;        /* vector length (static features only) */
   size_t num_windows;          /* # of windows for delta */
   HTS_Boolean is_msd;          /* flag for MSD */
   size_t ntree;                /* # of trees */
   size_t *npdf;                /* # of PDFs at each tree */
   float ***pdf;                /* PDFs */
   HTS_Tree *tree;              /* pointer to the list of trees */
   HTS_Question *question;      /* pointer to the list of questions */
} HTS_Model;

/* HTS_ModelSet: set of duration models, HMMs and GV models. */
typedef struct _HTS_ModelSet {
   char *hts_voice_version;     /* version of HTS voice format */
   size_t sampling_frequency;   /* sampling frequency */
   size_t frame_period;         /* frame period */
   size_t num_voices;           /* # of HTS voices */
   size_t num_states;           /* # of HMM states */
   size_t num_streams;          /* # of streams */
   char *stream_type;           /* stream type */
   char *fullcontext_format;    /* fullcontext label format */
   char *fullcontext_version;   /* version of fullcontext label */
   HTS_Question *gv_off_context;        /* GV switch */
   char **option;               /* options for each stream */
   HTS_Model *duration;         /* duration PDFs and trees */
   HTS_Window *window;          /* window coefficients for delta */
   HTS_Model **stream;          /* parameter PDFs and trees */
   HTS_Model **gv;              /* GV PDFs and trees */
} HTS_ModelSet;

/* label ----------------------------------------------------------- */

/* HTS_LabelString: individual label string with time information */
typedef struct _HTS_LabelString {
   struct _HTS_LabelString *next;       /* pointer to next label string */
   char *name;                  /* label string */
   double start;                /* start frame specified in the given label */
   double end;                  /* end frame specified in the given label */
} HTS_LabelString;

/* HTS_Label: list of label strings */
typedef struct _HTS_Label {
   HTS_LabelString *head;       /* pointer to the head of label string */
   size_t size;                 /* # of label strings */
} HTS_Label;

/* sstream --------------------------------------------------------- */

/* HTS_SStream: individual state stream */
typedef struct _HTS_SStream {
   size_t vector_length;        /* vector length (static features only) */
   double **mean;               /* mean vector sequence */
   double **vari;               /* variance vector sequence */
   double *msd;                 /* MSD parameter sequence */
   size_t win_size;             /* # of windows (static + deltas) */
   int *win_l_width;            /* left width of windows */
   int *win_r_width;            /* right width of windows */
   double **win_coefficient;    /* window coefficients */
   size_t win_max_width;        /* maximum width of windows */
   double *gv_mean;             /* mean vector of GV */
   double *gv_vari;             /* variance vector of GV */
   HTS_Boolean *gv_switch;      /* GV flag sequence */
} HTS_SStream;

/* HTS_SStreamSet: set of state stream */
typedef struct _HTS_SStreamSet {
   HTS_SStream *sstream;        /* state streams */
   size_t nstream;              /* # of streams */
   size_t nstate;               /* # of states */
   size_t *duration;            /* duration sequence */
   size_t total_state;          /* total state */
   size_t total_frame;          /* total frame */
} HTS_SStreamSet;

/* pstream --------------------------------------------------------- */

/* HTS_SMatrices: matrices/vectors used in the speech parameter generation algorithm. */
typedef struct _HTS_SMatrices {
   double **mean;               /* mean vector sequence */
   double **ivar;               /* inverse diag variance sequence */
   double *g;                   /* vector used in the forward substitution */
   double **wuw;                /* W' U^-1 W */
   double *wum;                 /* W' U^-1 mu */
} HTS_SMatrices;

/* HTS_PStream: individual PDF stream. */
typedef struct _HTS_PStream {
   size_t vector_length;        /* vector length (static features only) */
   size_t length;               /* stream length */
   size_t width;                /* width of dynamic window */
   double **par;                /* output parameter vector */
   HTS_SMatrices sm;            /* matrices for parameter generation */
   size_t win_size;             /* # of windows (static + deltas) */
   int *win_l_width;            /* left width of windows */
   int *win_r_width;            /* right width of windows */
   double **win_coefficient;    /* window coefficients */
   HTS_Boolean *msd_flag;       /* Boolean sequence for MSD */
   double *gv_mean;             /* mean vector of GV */
   double *gv_vari;             /* variance vector of GV */
   HTS_Boolean *gv_switch;      /* GV flag sequence */
   size_t gv_length;            /* frame length for GV calculation */
} HTS_PStream;

/* HTS_PStreamSet: set of PDF streams. */
typedef struct _HTS_PStreamSet {
   HTS_PStream *pstream;        /* PDF streams */
   size_t nstream;              /* # of PDF streams */
   size_t total_frame;          /* total frame */
} HTS_PStreamSet;

/* gstream --------------------------------------------------------- */

/* HTS_GStream: generated parameter stream. */
typedef struct _HTS_GStream {
   size_t vector_length;        /* vector length (static features only) */
   double **par;                /* generated parameter */
} HTS_GStream;

/* HTS_GStreamSet: set of generated parameter stream. */
typedef struct _HTS_GStreamSet {
   size_t total_nsample;        /* total sample */
   size_t total_frame;          /* total frame */
   size_t nstream;              /* # of streams */
   HTS_GStream *gstream;        /* generated parameter streams */
   double *gspeech;             /* generated speech */
} HTS_GStreamSet;

/* engine ---------------------------------------------------------- */

/* HTS_Condition: synthesis condition */
typedef struct _HTS_Condition {
   /* global */
   size_t sampling_frequency;   /* sampling frequency */
   size_t fperiod;              /* frame period */
   size_t audio_buff_size;      /* audio buffer size (for audio device) */
   HTS_Boolean stop;            /* stop flag */
   double volume;               /* volume */
   double *msd_threshold;       /* MSD thresholds */
   double *gv_weight;           /* GV weights */

   /* duration */
   HTS_Boolean phoneme_alignment_flag;  /* flag for using phoneme alignment in label */
   double speed;                /* speech speed */

   /* spectrum */
   size_t stage;                /* if stage=0 then gamma=0 else gamma=-1/stage */
   HTS_Boolean use_log_gain;    /* log gain flag (for LSP) */
   double alpha;                /* all-pass constant */
   double beta;                 /* postfiltering coefficient */

   /* log F0 */
   double additional_half_tone; /* additional half tone */

   /* interpolation weights */
   double *duration_iw;         /* weights for duration interpolation */
   double **parameter_iw;       /* weights for parameter interpolation */
   double **gv_iw;              /* weights for GV interpolation */
} HTS_Condition;

/* HTS_Engine: Engine itself. */
typedef struct _HTS_Engine {
   HTS_Condition condition;     /* synthesis condition */
   HTS_Audio audio;             /* audio output */
   HTS_ModelSet ms;             /* set of duration models, HMMs and GV models */
   HTS_Label label;             /* label */
   HTS_SStreamSet sss;          /* set of state streams */
   HTS_PStreamSet pss;          /* set of PDF streams */
   HTS_GStreamSet gss;          /* set of generated parameter streams */
} HTS_Engine;

/* engine method --------------------------------------------------- */

/* HTS_Engine_initialize: initialize engine */
void HTS_Engine_initialize(HTS_Engine *engine);

/* HTS_Engine_load: load HTS voices */
HTS_Boolean HTS_Engine_load(HTS_Engine *engine, char **voices, size_t num_voices);

/* HTS_Engine_set_sampling_frequency: set sampling frequency */
void HTS_Engine_set_sampling_frequency(HTS_Engine *engine, size_t i);

/* HTS_Engine_get_sampling_frequency: get sampling frequency */
size_t HTS_Engine_get_sampling_frequency(HTS_Engine *engine);

/* HTS_Engine_set_fperiod: set frame period */
void HTS_Engine_set_fperiod(HTS_Engine *engine, size_t i);

/* HTS_Engine_get_fperiod: get frame period */
size_t HTS_Engine_get_fperiod(HTS_Engine *engine);

/* HTS_Engine_set_audio_buff_size: set audio buffer size */
void HTS_Engine_set_audio_buff_size(HTS_Engine *engine, size_t i);

/* HTS_Engine_get_audio_buff_size: get audio buffer size */
size_t HTS_Engine_get_audio_buff_size(HTS_Engine *engine);

/* HTS_Engine_set_stop_flag: set stop flag */
void HTS_Engine_set_stop_flag(HTS_Engine *engine, HTS_Boolean b);

/* HTS_Engine_get_stop_flag: get stop flag */
HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine *engine);

/* HTS_Engine_set_volume: set volume in db */
void HTS_Engine_set_volume(HTS_Engine *engine, double f);

/* HTS_Engine_get_volume: get volume in db */
double HTS_Engine_get_volume(HTS_Engine *engine);

/* HTS_Engine_set_msd_threshold: set MSD threshold */
void HTS_Engine_set_msd_threshold(HTS_Engine *engine, size_t stream_index, double f);

/* HTS_Engine_get_msd_threshold: get MSD threshold */
double HTS_Engine_get_msd_threshold(HTS_Engine *engine, size_t stream_index);

/* HTS_Engine_set_gv_weight: set GV weight */
void HTS_Engine_set_gv_weight(HTS_Engine *engine, size_t stream_index, double f);

/* HTS_Engine_get_gv_weight: get GV weight */
double HTS_Engine_get_gv_weight(HTS_Engine *engine, size_t stream_index);

/* HTS_Engine_set_speed: set speech speed */
void HTS_Engine_set_speed(HTS_Engine *engine, double f);

/* HTS_Engine_set_phoneme_alignment_flag: set flag for using phoneme alignment in label */
void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine *engine, HTS_Boolean b);

/* HTS_Engine_set_alpha: set alpha */
void HTS_Engine_set_alpha(HTS_Engine *engine, double f);

/* HTS_Engine_get_alpha: get alpha */
double HTS_Engine_get_alpha(HTS_Engine *engine);

/* HTS_Engine_set_beta: set beta */
void HTS_Engine_set_beta(HTS_Engine *engine, double f);

/* HTS_Engine_get_beta: get beta */
double HTS_Engine_get_beta(HTS_Engine *engine);

/* HTS_Engine_add_half_tone: add half tone */
void HTS_Engine_add_half_tone(HTS_Engine *engine, double f);

/* HTS_Engine_set_duration_interpolation_weight: set interpolation weight for duration */
void HTS_Engine_set_duration_interpolation_weight(HTS_Engine *engine, size_t voice_index, double f);

/* HTS_Engine_get_duration_interpolation_weight: get interpolation weight for duration */
double HTS_Engine_get_duration_interpolation_weight(HTS_Engine *engine, size_t voice_index);

/* HTS_Engine_set_parameter_interpolation_weight: set interpolation weight for parameter */
void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index, double f);

/* HTS_Engine_get_parameter_interpolation_weight: get interpolation weight for parameter */
double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index);

/* HTS_Engine_set_gv_interpolation_weight: set interpolation weight for GV */
void HTS_Engine_set_gv_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index, double f);

/* HTS_Engine_get_gv_interpolation_weight: get interpolation weight for GV */
double HTS_Engine_get_gv_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index);

/* HTS_Engine_get_total_state: get total number of state */
size_t HTS_Engine_get_total_state(HTS_Engine *engine);

/* HTS_Engine_set_state_mean: set mean value of state */
void HTS_Engine_set_state_mean(HTS_Engine *engine, size_t stream_index, size_t state_index, size_t vector_index, double f);

/* HTS_Engine_get_state_mean: get mean value of state */
double HTS_Engine_get_state_mean(HTS_Engine *engine, size_t stream_index, size_t state_index, size_t vector_index);

/* HTS_Engine_get_state_duration: get state duration */
size_t HTS_Engine_get_state_duration(HTS_Engine *engine, size_t state_index);

/* HTS_Engine_get_nvoices: get number of voices */
size_t HTS_Engine_get_nvoices(HTS_Engine *engine);

/* HTS_Engine_get_nstream: get number of stream */
size_t HTS_Engine_get_nstream(HTS_Engine *engine);

/* HTS_Engine_get_nstate: get number of state */
size_t HTS_Engine_get_nstate(HTS_Engine *engine);

/* HTS_Engine_get_fullcontext_label_format: get full context label format */
const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine *engine);

/* HTS_Engine_get_fullcontext_label_version: get full context label version */
const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine *engine);

/* HTS_Engine_get_total_frame: get total number of frame */
size_t HTS_Engine_get_total_frame(HTS_Engine *engine);

/* HTS_Engine_get_nsamples: get number of samples */
size_t HTS_Engine_get_nsamples(HTS_Engine *engine);

/* HTS_Engine_get_generated_parameter: output generated parameter */
double HTS_Engine_get_generated_parameter(HTS_Engine *engine, size_t stream_index, size_t frame_index, size_t vector_index);

/* HTS_Engine_get_generated_speech: output generated speech */
double HTS_Engine_get_generated_speech(HTS_Engine *engine, size_t index);

/* HTS_Engine_synthesize_from_fn: synthesize speech from file name */
HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine *engine, const char *fn);

/* HTS_Engine_synthesize_from_strings: synthesize speech from string list */
HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine *engine, char **lines, size_t num_lines);

/* HTS_Engine_generate_state_sequence_from_fn: generate state sequence from file name (1st synthesis step) */
HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine *engine, const char *fn);

/* HTS_Engine_generate_state_sequence_from_strings: generate state sequence from string list (1st synthesis step) */
HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine *engine, char **lines, size_t num_lines);

/* HTS_Engine_generate_parameter_sequence: generate parameter sequence (2nd synthesis step) */
HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine *engine);

/* HTS_Engine_generate_sample_sequence: generate sample sequence (3rd synthesis step) */
HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine *engine);

/* HTS_Engine_save_information: save trace information */
void HTS_Engine_save_information(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_save_label: save label with time */
void HTS_Engine_save_label(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_save_generated_parameter: save generated parameter */
void HTS_Engine_save_generated_parameter(HTS_Engine *engine, size_t stream_index, FILE *fp);

/* HTS_Engine_save_generated_speech: save generated speech */
void HTS_Engine_save_generated_speech(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_save_riff: save RIFF format file */
void HTS_Engine_save_riff(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_refresh: free memory per one time synthesis */
void HTS_Engine_refresh(HTS_Engine *engine);

/* HTS_Engine_clear: free engine */
void HTS_Engine_clear(HTS_Engine *engine);

HTS_ENGINE_H_END

#endif                          /* !HTS_ENGINE_H */