1 /* ----------------------------------------------------------------- */ 2 /* The HMM-Based Speech Synthesis Engine "hts_engine API" */ 3 /* developed by HTS Working Group */ 4 /* http://hts-engine.sourceforge.net/ */ 5 /* ----------------------------------------------------------------- */ 6 /* */ 7 /* Copyright (c) 2001-2015 Nagoya Institute of Technology */ 8 /* Department of Computer Science */ 9 /* */ 10 /* 2001-2008 Tokyo Institute of Technology */ 11 /* Interdisciplinary Graduate School of */ 12 /* Science and Engineering */ 13 /* */ 14 /* All rights reserved. */ 15 /* */ 16 /* Redistribution and use in source and binary forms, with or */ 17 /* without modification, are permitted provided that the following */ 18 /* conditions are met: */ 19 /* */ 20 /* - Redistributions of source code must retain the above copyright */ 21 /* notice, this list of conditions and the following disclaimer. */ 22 /* - Redistributions in binary form must reproduce the above */ 23 /* copyright notice, this list of conditions and the following */ 24 /* disclaimer in the documentation and/or other materials provided */ 25 /* with the distribution. */ 26 /* - Neither the name of the HTS working group nor the names of its */ 27 /* contributors may be used to endorse or promote products derived */ 28 /* from this software without specific prior written permission. */ 29 /* */ 30 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ 31 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 32 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 33 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 34 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */ 35 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */ 36 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */ 37 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */ 38 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */ 39 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */ 40 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */ 41 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 42 /* POSSIBILITY OF SUCH DAMAGE. */ 43 /* ----------------------------------------------------------------- */ 44 45 #ifndef HTS_SSTREAM_C 46 #define HTS_SSTREAM_C 47 48 #ifdef __cplusplus 49 #define HTS_SSTREAM_C_START extern "C" { 50 #define HTS_SSTREAM_C_END } 51 #else 52 #define HTS_SSTREAM_C_START 53 #define HTS_SSTREAM_C_END 54 #endif /* __CPLUSPLUS */ 55 56 HTS_SSTREAM_C_START; 57 58 #include <stdlib.h> 59 #include <math.h> 60 61 /* hts_engine libraries */ 62 #include "HTS_hidden.h" 63 64 /* HTS_set_default_duration: set default duration from state duration probability distribution */ 65 static double HTS_set_default_duration(size_t * duration, double *mean, double *vari, size_t size) 66 { 67 size_t i; 68 double temp; 69 size_t sum = 0; 70 71 for (i = 0; i < size; i++) { 72 temp = mean[i] + 0.5; 73 if (temp < 1.0) 74 duration[i] = 1; 75 else 76 duration[i] = (size_t) temp; 77 sum += duration[i]; 78 } 79 80 return (double) sum; 81 } 82 83 static double HTS_set_duration_by_speed(size_t * duration, double *mean, double *vari, size_t size, double speed) 84 { 85 size_t i; 86 double temp; 87 size_t sum = 0; 88 89 for (i = 0; i < size; i++) { 90 temp = mean[i]/speed + 0.5; 91 if (temp < 1.0) 92 duration[i] = 1; 93 else 94 duration[i] = (size_t) temp; 95 sum += duration[i]; 96 } 97 98 return (double) sum; 99 } 100 101 /* HTS_set_specified_duration: set duration from state duration probability distribution and specified frame length */ 102 static double HTS_set_specified_duration(size_t * duration, double *mean, double *vari, size_t size, double frame_length) 103 { 104 size_t i; 105 int j; 106 double temp1, temp2; 107 double rho = 0.0; 108 size_t sum = 0; 109 size_t target_length; 110 111 /* get the target frame length */ 112 if (frame_length + 0.5 < 1.0) 113 target_length = 1; 114 else 115 target_length = (size_t) (frame_length + 0.5); 116 117 /* check the specified duration */ 118 if (target_length <= size) { 119 if (target_length < size) 120 HTS_error(-1, "HTS_set_specified_duration: Specified frame length is too short.\n"); 121 for (i = 0; i < size; i++) 122 duration[i] = 1; 123 return (double) size; 124 } 125 126 /* RHO calculation */ 127 temp1 = 0.0; 128 temp2 = 0.0; 129 for (i = 0; i < size; i++) { 130 temp1 += mean[i]; 131 temp2 += vari[i]; 132 } 133 rho = ((double) target_length - temp1) / temp2; 134 135 /* first estimation */ 136 for (i = 0; i < size; i++) { 137 temp1 = mean[i] + rho * vari[i] + 0.5; 138 if (temp1 < 1.0) 139 duration[i] = 1; 140 else 141 duration[i] = (size_t) temp1; 142 sum += duration[i]; 143 } 144 145 /* loop estimation */ 146 while (target_length != sum) { 147 /* sarch flexible state and modify its duration */ 148 if (target_length > sum) { 149 j = -1; 150 for (i = 0; i < size; i++) { 151 temp2 = fabs(rho - ((double) duration[i] + 1 - mean[i]) / vari[i]); 152 if (j < 0 || temp1 > temp2) { 153 j = i; 154 temp1 = temp2; 155 } 156 } 157 sum++; 158 duration[j]++; 159 } else { 160 j = -1; 161 for (i = 0; i < size; i++) { 162 if (duration[i] > 1) { 163 temp2 = fabs(rho - ((double) duration[i] - 1 - mean[i]) / vari[i]); 164 if (j < 0 || temp1 > temp2) { 165 j = i; 166 temp1 = temp2; 167 } 168 } 169 } 170 sum--; 171 duration[j]--; 172 } 173 } 174 175 return (double) target_length; 176 } 177 178 /* HTS_SStreamSet_initialize: initialize state stream set */ 179 void HTS_SStreamSet_initialize(HTS_SStreamSet * sss) 180 { 181 sss->nstream = 0; 182 sss->nstate = 0; 183 sss->sstream = NULL; 184 sss->duration = NULL; 185 sss->total_state = 0; 186 sss->total_frame = 0; 187 } 188 189 /* HTS_SStreamSet_create: parse label and determine state duration */ 190 HTS_Boolean HTS_SStreamSet_create(HTS_SStreamSet * sss, HTS_ModelSet * ms, HTS_Label * label, HTS_Boolean phoneme_alignment_flag, double speed, double *duration_iw, double **parameter_iw, double **gv_iw) 191 { 192 size_t i, j, k; 193 double temp; 194 int shift; 195 size_t state; 196 HTS_SStream *sst; 197 double *duration_mean, *duration_vari; 198 double frame_length; 199 size_t next_time; 200 size_t next_state; 201 double label_dur_mod; 202 203 if (HTS_Label_get_size(label) == 0) 204 return FALSE; 205 206 /* check interpolation weights */ 207 for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++) 208 temp += duration_iw[i]; 209 if (temp == 0.0) { 210 return FALSE; 211 } else if (temp != 1.0) { 212 for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++) 213 if (duration_iw[i] != 0.0) 214 duration_iw[i] /= temp; 215 } 216 217 for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) { 218 for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++) 219 temp += parameter_iw[j][i]; 220 if (temp == 0.0) { 221 return FALSE; 222 } else if (temp != 1.0) { 223 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) 224 if (parameter_iw[j][i] != 0.0) 225 parameter_iw[j][i] /= temp; 226 } 227 if (HTS_ModelSet_use_gv(ms, i)) { 228 for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++) 229 temp += gv_iw[j][i]; 230 if (temp == 0.0) 231 return FALSE; 232 else if (temp != 1.0) 233 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) 234 if (gv_iw[j][i] != 0.0) 235 gv_iw[j][i] /= temp; 236 } 237 } 238 239 /* initialize state sequence */ 240 sss->nstate = HTS_ModelSet_get_nstate(ms); 241 sss->nstream = HTS_ModelSet_get_nstream(ms); 242 sss->total_frame = 0; 243 sss->total_state = HTS_Label_get_size(label) * sss->nstate; 244 sss->duration = (size_t *) HTS_calloc(sss->total_state, sizeof(size_t)); 245 sss->sstream = (HTS_SStream *) HTS_calloc(sss->nstream, sizeof(HTS_SStream)); 246 for (i = 0; i < sss->nstream; i++) { 247 sst = &sss->sstream[i]; 248 sst->vector_length = HTS_ModelSet_get_vector_length(ms, i); 249 sst->mean = (double **) HTS_calloc(sss->total_state, sizeof(double *)); 250 sst->vari = (double **) HTS_calloc(sss->total_state, sizeof(double *)); 251 if (HTS_ModelSet_is_msd(ms, i)) 252 sst->msd = (double *) HTS_calloc(sss->total_state, sizeof(double)); 253 else 254 sst->msd = NULL; 255 for (j = 0; j < sss->total_state; j++) { 256 sst->mean[j] = (double *) HTS_calloc(sst->vector_length * HTS_ModelSet_get_window_size(ms, i), sizeof(double)); 257 sst->vari[j] = (double *) HTS_calloc(sst->vector_length * HTS_ModelSet_get_window_size(ms, i), sizeof(double)); 258 } 259 if (HTS_ModelSet_use_gv(ms, i)) { 260 sst->gv_switch = (HTS_Boolean *) HTS_calloc(sss->total_state, sizeof(HTS_Boolean)); 261 for (j = 0; j < sss->total_state; j++) 262 sst->gv_switch[j] = TRUE; 263 } else { 264 sst->gv_switch = NULL; 265 } 266 } 267 268 /* determine state duration */ 269 duration_mean = (double *) HTS_calloc(sss->total_state, sizeof(double)); 270 duration_vari = (double *) HTS_calloc(sss->total_state, sizeof(double)); 271 for (i = 0; i < HTS_Label_get_size(label); i++) 272 { 273 HTS_ModelSet_get_duration(ms, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i), duration_iw, &duration_mean[i * sss->nstate], &duration_vari[i * sss->nstate]); 274 label_dur_mod=HTS_Label_get_dur_mod(label, i); 275 for(j=0; j < sss->nstate; ++j) 276 duration_mean[i*sss->nstate+j]*=label_dur_mod; 277 } 278 if (phoneme_alignment_flag == TRUE) { 279 /* use duration set by user */ 280 next_time = 0; 281 next_state = 0; 282 state = 0; 283 for (i = 0; i < HTS_Label_get_size(label); i++) { 284 temp = HTS_Label_get_end_frame(label, i); 285 if (temp >= 0) { 286 next_time += (size_t) HTS_set_specified_duration(&sss->duration[next_state], &duration_mean[next_state], &duration_vari[next_state], state + sss->nstate - next_state, temp - next_time); 287 next_state = state + sss->nstate; 288 } else if (i + 1 == HTS_Label_get_size(label)) { 289 HTS_error(-1, "HTS_SStreamSet_create: The time of final label is not specified.\n"); 290 HTS_set_default_duration(&sss->duration[next_state], &duration_mean[next_state], &duration_vari[next_state], state + sss->nstate - next_state); 291 } 292 state += sss->nstate; 293 } 294 } else { 295 /* determine frame length */ 296 if (speed != 1.0) { 297 /* temp = 0.0; */ 298 /* for (i = 0; i < sss->total_state; i++) { */ 299 /* temp += duration_mean[i]; */ 300 /* } */ 301 /* frame_length = temp / speed; */ 302 /* HTS_set_specified_duration(sss->duration, duration_mean, duration_vari, sss->total_state, frame_length); */ 303 HTS_set_duration_by_speed(sss->duration, duration_mean, duration_vari, sss->total_state, speed); 304 } else { 305 HTS_set_default_duration(sss->duration, duration_mean, duration_vari, sss->total_state); 306 } 307 } 308 HTS_free(duration_mean); 309 HTS_free(duration_vari); 310 311 for(i=0;i<(sss->nstate-1);++i) 312 { 313 sss->duration[i]=1; 314 } 315 if(sss->duration[sss->nstate-1]>1) 316 sss->duration[sss->nstate-1]=1; 317 318 /* get parameter */ 319 for (i = 0, state = 0; i < HTS_Label_get_size(label); i++) { 320 for (j = 2; j <= sss->nstate + 1; j++) { 321 sss->total_frame += sss->duration[state]; 322 for (k = 0; k < sss->nstream; k++) { 323 sst = &sss->sstream[k]; 324 if (sst->msd) 325 HTS_ModelSet_get_parameter(ms, k, j, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i), (const double *const *) parameter_iw, sst->mean[state], sst->vari[state], &sst->msd[state]); 326 else 327 HTS_ModelSet_get_parameter(ms, k, j, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i), (const double *const *) parameter_iw, sst->mean[state], sst->vari[state], NULL); 328 } 329 state++; 330 } 331 } 332 333 /* copy dynamic window */ 334 for (i = 0; i < sss->nstream; i++) { 335 sst = &sss->sstream[i]; 336 sst->win_size = HTS_ModelSet_get_window_size(ms, i); 337 sst->win_max_width = HTS_ModelSet_get_window_max_width(ms, i); 338 sst->win_l_width = (int *) HTS_calloc(sst->win_size, sizeof(int)); 339 sst->win_r_width = (int *) HTS_calloc(sst->win_size, sizeof(int)); 340 sst->win_coefficient = (double **) HTS_calloc(sst->win_size, sizeof(double)); 341 for (j = 0; j < sst->win_size; j++) { 342 sst->win_l_width[j] = HTS_ModelSet_get_window_left_width(ms, i, j); 343 sst->win_r_width[j] = HTS_ModelSet_get_window_right_width(ms, i, j); 344 if (sst->win_l_width[j] + sst->win_r_width[j] == 0) 345 sst->win_coefficient[j] = (double *) HTS_calloc(-2 * sst->win_l_width[j] + 1, sizeof(double)); 346 else 347 sst->win_coefficient[j] = (double *) HTS_calloc(-2 * sst->win_l_width[j], sizeof(double)); 348 sst->win_coefficient[j] -= sst->win_l_width[j]; 349 for (shift = sst->win_l_width[j]; shift <= sst->win_r_width[j]; shift++) 350 sst->win_coefficient[j][shift] = HTS_ModelSet_get_window_coefficient(ms, i, j, shift); 351 } 352 } 353 354 /* determine GV */ 355 for (i = 0; i < sss->nstream; i++) { 356 sst = &sss->sstream[i]; 357 if (HTS_ModelSet_use_gv(ms, i)) { 358 sst->gv_mean = (double *) HTS_calloc(sst->vector_length, sizeof(double)); 359 sst->gv_vari = (double *) HTS_calloc(sst->vector_length, sizeof(double)); 360 HTS_ModelSet_get_gv(ms, i, HTS_Label_get_string(label, 0), HTS_Label_get_parsed(label, 0), (const double *const *) gv_iw, sst->gv_mean, sst->gv_vari); 361 } else { 362 sst->gv_mean = NULL; 363 sst->gv_vari = NULL; 364 } 365 } 366 367 for (i = 0; i < HTS_Label_get_size(label); i++) 368 if (HTS_ModelSet_get_gv_flag(ms, HTS_Label_get_string(label, i), HTS_Label_get_parsed(label, i)) == FALSE) 369 for (j = 0; j < sss->nstream; j++) 370 if (HTS_ModelSet_use_gv(ms, j) == TRUE) 371 for (k = 0; k < sss->nstate; k++) 372 sss->sstream[j].gv_switch[i * sss->nstate + k] = FALSE; 373 374 return TRUE; 375 } 376 377 /* HTS_SStreamSet_get_nstream: get number of stream */ 378 size_t HTS_SStreamSet_get_nstream(HTS_SStreamSet * sss) 379 { 380 return sss->nstream; 381 } 382 383 /* HTS_SStreamSet_get_vector_length: get vector length */ 384 size_t HTS_SStreamSet_get_vector_length(HTS_SStreamSet * sss, size_t stream_index) 385 { 386 return sss->sstream[stream_index].vector_length; 387 } 388 389 /* HTS_SStreamSet_is_msd: get MSD flag */ 390 HTS_Boolean HTS_SStreamSet_is_msd(HTS_SStreamSet * sss, size_t stream_index) 391 { 392 return sss->sstream[stream_index].msd ? TRUE : FALSE; 393 } 394 395 /* HTS_SStreamSet_get_total_state: get total number of state */ 396 size_t HTS_SStreamSet_get_total_state(HTS_SStreamSet * sss) 397 { 398 return sss->total_state; 399 } 400 401 /* HTS_SStreamSet_get_total_frame: get total number of frame */ 402 size_t HTS_SStreamSet_get_total_frame(HTS_SStreamSet * sss) 403 { 404 return sss->total_frame; 405 } 406 407 /* HTS_SStreamSet_get_msd: get MSD parameter */ 408 double HTS_SStreamSet_get_msd(HTS_SStreamSet * sss, size_t stream_index, size_t state_index) 409 { 410 return sss->sstream[stream_index].msd[state_index]; 411 } 412 413 /* HTS_SStreamSet_window_size: get dynamic window size */ 414 size_t HTS_SStreamSet_get_window_size(HTS_SStreamSet * sss, size_t stream_index) 415 { 416 return sss->sstream[stream_index].win_size; 417 } 418 419 /* HTS_SStreamSet_get_window_left_width: get left width of dynamic window */ 420 int HTS_SStreamSet_get_window_left_width(HTS_SStreamSet * sss, size_t stream_index, size_t window_index) 421 { 422 return sss->sstream[stream_index].win_l_width[window_index]; 423 } 424 425 /* HTS_SStreamSet_get_winodow_right_width: get right width of dynamic window */ 426 int HTS_SStreamSet_get_window_right_width(HTS_SStreamSet * sss, size_t stream_index, size_t window_index) 427 { 428 return sss->sstream[stream_index].win_r_width[window_index]; 429 } 430 431 /* HTS_SStreamSet_get_window_coefficient: get coefficient of dynamic window */ 432 double HTS_SStreamSet_get_window_coefficient(HTS_SStreamSet * sss, size_t stream_index, size_t window_index, int coefficient_index) 433 { 434 return sss->sstream[stream_index].win_coefficient[window_index][coefficient_index]; 435 } 436 437 /* HTS_SStreamSet_get_window_max_width: get max width of dynamic window */ 438 size_t HTS_SStreamSet_get_window_max_width(HTS_SStreamSet * sss, size_t stream_index) 439 { 440 return sss->sstream[stream_index].win_max_width; 441 } 442 443 /* HTS_SStreamSet_use_gv: get GV flag */ 444 HTS_Boolean HTS_SStreamSet_use_gv(HTS_SStreamSet * sss, size_t stream_index) 445 { 446 return sss->sstream[stream_index].gv_mean ? TRUE : FALSE; 447 } 448 449 /* HTS_SStreamSet_get_duration: get state duration */ 450 size_t HTS_SStreamSet_get_duration(HTS_SStreamSet * sss, size_t state_index) 451 { 452 return sss->duration[state_index]; 453 } 454 455 /* HTS_SStreamSet_get_mean: get mean parameter */ 456 double HTS_SStreamSet_get_mean(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index) 457 { 458 return sss->sstream[stream_index].mean[state_index][vector_index]; 459 } 460 461 /* HTS_SStreamSet_set_mean: set mean parameter */ 462 void HTS_SStreamSet_set_mean(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index, double f) 463 { 464 sss->sstream[stream_index].mean[state_index][vector_index] = f; 465 } 466 467 /* HTS_SStreamSet_get_vari: get variance parameter */ 468 double HTS_SStreamSet_get_vari(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index) 469 { 470 return sss->sstream[stream_index].vari[state_index][vector_index]; 471 } 472 473 /* HTS_SStreamSet_set_vari: set variance parameter */ 474 void HTS_SStreamSet_set_vari(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index, double f) 475 { 476 sss->sstream[stream_index].vari[state_index][vector_index] = f; 477 } 478 479 /* HTS_SStreamSet_get_gv_mean: get GV mean parameter */ 480 double HTS_SStreamSet_get_gv_mean(HTS_SStreamSet * sss, size_t stream_index, size_t vector_index) 481 { 482 return sss->sstream[stream_index].gv_mean[vector_index]; 483 } 484 485 /* HTS_SStreamSet_get_gv_mean: get GV variance parameter */ 486 double HTS_SStreamSet_get_gv_vari(HTS_SStreamSet * sss, size_t stream_index, size_t vector_index) 487 { 488 return sss->sstream[stream_index].gv_vari[vector_index]; 489 } 490 491 /* HTS_SStreamSet_set_gv_switch: set GV switch */ 492 void HTS_SStreamSet_set_gv_switch(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, HTS_Boolean i) 493 { 494 sss->sstream[stream_index].gv_switch[state_index] = i; 495 } 496 497 /* HTS_SStreamSet_get_gv_switch: get GV switch */ 498 HTS_Boolean HTS_SStreamSet_get_gv_switch(HTS_SStreamSet * sss, size_t stream_index, size_t state_index) 499 { 500 return sss->sstream[stream_index].gv_switch[state_index]; 501 } 502 503 /* HTS_SStreamSet_clear: free state stream set */ 504 void HTS_SStreamSet_clear(HTS_SStreamSet * sss) 505 { 506 size_t i, j; 507 HTS_SStream *sst; 508 509 if (sss->sstream) { 510 for (i = 0; i < sss->nstream; i++) { 511 sst = &sss->sstream[i]; 512 for (j = 0; j < sss->total_state; j++) { 513 HTS_free(sst->mean[j]); 514 HTS_free(sst->vari[j]); 515 } 516 if (sst->msd) 517 HTS_free(sst->msd); 518 HTS_free(sst->mean); 519 HTS_free(sst->vari); 520 for (j = 0; j < sst->win_size; j++) { 521 sst->win_coefficient[j] += sst->win_l_width[j]; 522 HTS_free(sst->win_coefficient[j]); 523 } 524 HTS_free(sst->win_coefficient); 525 HTS_free(sst->win_l_width); 526 HTS_free(sst->win_r_width); 527 if (sst->gv_mean) 528 HTS_free(sst->gv_mean); 529 if (sst->gv_vari) 530 HTS_free(sst->gv_vari); 531 if (sst->gv_switch) 532 HTS_free(sst->gv_switch); 533 } 534 HTS_free(sss->sstream); 535 } 536 if (sss->duration) 537 HTS_free(sss->duration); 538 539 HTS_SStreamSet_initialize(sss); 540 } 541 542 HTS_SSTREAM_C_END; 543 544 #endif /* !HTS_SSTREAM_C */ 545