1 /* 2 * Copyright (C) 2005 to 2014 by Jonathan Duddington 3 * email: jonsd@users.sourceforge.net 4 * Copyright (C) 2015-2017 Reece H. Dunn 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, see: <http://www.gnu.org/licenses/>. 18 */ 19 20 //#include <stdbool.h> 21 22 #include "phoneme.h" 23 #include "voice.h" 24 #ifdef __cplusplus 25 extern "C" 26 { 27 #endif 28 29 #define espeakINITIALIZE_PHONEME_IPA 0x0002 // move this to speak_lib.h, after eSpeak version 1.46.02 30 31 #define N_PHONEME_LIST 1000 // enough for source[N_TR_SOURCE] full of text, else it will truncate 32 33 #define MAX_HARMONIC 400 // 400 * 50Hz = 20 kHz, more than enough 34 #define N_SEQ_FRAMES 25 // max frames in a spectrum sequence (real max is ablut 8) 35 #define STEPSIZE 64 // 2.9mS at 22 kHz sample rate 36 37 // flags set for frames within a spectrum sequence 38 #define FRFLAG_KLATT 0x01 // this frame includes extra data for Klatt synthesizer 39 #define FRFLAG_VOWEL_CENTRE 0x02 // centre point of vowel 40 #define FRFLAG_LEN_MOD 0x04 // reduce effect of length adjustment 41 #define FRFLAG_BREAK_LF 0x08 // but keep f3 upwards 42 #define FRFLAG_BREAK 0x10 // don't merge with next frame 43 #define FRFLAG_BREAK_2 0x18 // FRFLAG_BREAK_LF or FRFLAG_BREAK 44 #define FRFLAG_FORMANT_RATE 0x20 // Flag5 allow increased rate of change of formant freq 45 #define FRFLAG_MODULATE 0x40 // Flag6 modulate amplitude of some cycles to give trill 46 #define FRFLAG_DEFER_WAV 0x80 // Flag7 defer mixing WAV until the next frame 47 #define FRFLAG_LEN_MOD2 0x4000 // reduce effect of length adjustment, used for the start of a vowel 48 #define FRFLAG_COPIED 0x8000 // This frame has been copied into temporary rw memory 49 50 #define SFLAG_SEQCONTINUE 0x01 // a liquid or nasal after a vowel, but not followed by a vowel 51 #define SFLAG_EMBEDDED 0x02 // there are embedded commands before this phoneme 52 #define SFLAG_SYLLABLE 0x04 // vowel or syllabic consonant 53 #define SFLAG_LENGTHEN 0x08 // lengthen symbol : included after this phoneme 54 #define SFLAG_DICTIONARY 0x10 // the pronunciation of this word was listed in the xx_list dictionary 55 #define SFLAG_SWITCHED_LANG 0x20 // this word uses phonemes from a different language 56 #define SFLAG_PROMOTE_STRESS 0x40 // this unstressed word can be promoted to stressed 57 58 #define SFLAG_PREV_PAUSE 0x1000 // consider previous phoneme as pause 59 #define SFLAG_NEXT_PAUSE 0x2000 // consider next phoneme as pause 60 61 // embedded command numbers 62 #define EMBED_P 1 // pitch 63 #define EMBED_S 2 // speed (used in setlengths) 64 #define EMBED_A 3 // amplitude/volume 65 #define EMBED_R 4 // pitch range/expression 66 #define EMBED_H 5 // echo/reverberation 67 #define EMBED_T 6 // different tone for announcing punctuation (not used) 68 #define EMBED_I 7 // sound icon 69 #define EMBED_S2 8 // speed (used in synthesize) 70 #define EMBED_Y 9 // say-as commands 71 #define EMBED_M 10 // mark name 72 #define EMBED_U 11 // audio uri 73 #define EMBED_B 12 // break 74 #define EMBED_F 13 // emphasis 75 #define EMBED_C 14 // capital letter indication 76 77 #define N_EMBEDDED_VALUES 15 78 extern int embedded_value[N_EMBEDDED_VALUES]; 79 extern int embedded_default[N_EMBEDDED_VALUES]; 80 81 #define N_MARKERS 8 82 83 #define N_KLATTP 10 // this affects the phoneme data file format 84 #define N_KLATTP2 14 // used in vowel files, with extra parameters for future extensions 85 86 #define KLATT_AV 0 87 #define KLATT_FNZ 1 // nasal zero freq 88 #define KLATT_Tilt 2 89 #define KLATT_Aspr 3 90 #define KLATT_Skew 4 91 92 #define KLATT_Kopen 5 93 #define KLATT_AVp 6 94 #define KLATT_Fric 7 95 #define KLATT_FricBP 8 96 #define KLATT_Turb 9 97 98 typedef struct { // 64 bytes 99 short frflags; 100 short ffreq[7]; 101 unsigned char length; 102 unsigned char rms; 103 unsigned char fheight[8]; 104 unsigned char fwidth[6]; // width/4 f0-5 105 unsigned char fright[3]; // width/4 f0-2 106 unsigned char bw[4]; // Klatt bandwidth BNZ /2, f1,f2,f3 107 unsigned char klattp[5]; // AV, FNZ, Tilt, Aspr, Skew 108 unsigned char klattp2[5]; // continuation of klattp[], Avp, Fric, FricBP, Turb 109 unsigned char klatt_ap[7]; // Klatt parallel amplitude 110 unsigned char klatt_bp[7]; // Klatt parallel bandwidth /2 111 unsigned char spare; // pad to multiple of 4 bytes 112 } frame_t; // with extra Klatt parameters for parallel resonators 113 114 typedef struct { // 44 bytes 115 short frflags; 116 short ffreq[7]; 117 unsigned char length; 118 unsigned char rms; 119 unsigned char fheight[8]; 120 unsigned char fwidth[6]; // width/4 f0-5 121 unsigned char fright[3]; // width/4 f0-2 122 unsigned char bw[4]; // Klatt bandwidth BNZ /2, f1,f2,f3 123 unsigned char klattp[5]; // AV, FNZ, Tilt, Aspr, Skew 124 } frame_t2; // without the extra Klatt parameters 125 126 // formant data used by wavegen 127 typedef struct { 128 int freq; // Hz<<16 129 int height; // height<<15 130 int left; // Hz<<16 131 int right; // Hz<<16 132 DOUBLEX freq1; // floating point versions of the above 133 DOUBLEX height1; 134 DOUBLEX left1; 135 DOUBLEX right1; 136 DOUBLEX freq_inc; // increment by this every 64 samples 137 DOUBLEX height_inc; 138 DOUBLEX left_inc; 139 DOUBLEX right_inc; 140 } wavegen_peaks_t; 141 142 typedef struct { 143 unsigned char *pitch_env; 144 int pitch; // pitch Hz*256 145 int pitch_ix; // index into pitch envelope (*256) 146 int pitch_inc; // increment to pitch_ix 147 int pitch_base; // Hz*256 low, before modified by envelope 148 int pitch_range; // Hz*256 range of envelope 149 150 unsigned char *mix_wavefile; // wave file to be added to synthesis 151 int n_mix_wavefile; // length in bytes 152 int mix_wave_scale; // 0=2 byte samples 153 int mix_wave_amp; 154 int mix_wavefile_ix; 155 int mix_wavefile_max; // length of available WAV data (in bytes) 156 int mix_wavefile_offset; 157 158 int amplitude; 159 int amplitude_v; 160 int amplitude_fmt; // percentage amplitude adjustment for formant synthesis 161 } WGEN_DATA; 162 163 typedef struct { 164 double a; 165 double b; 166 double c; 167 double x1; 168 double x2; 169 } RESONATOR; 170 171 typedef struct { 172 short length_total; // not used 173 unsigned char n_frames; 174 unsigned char sqflags; 175 frame_t2 frame[N_SEQ_FRAMES]; // max. frames in a spectrum sequence 176 } SPECT_SEQ; // sequence of espeak formant frames 177 178 typedef struct { 179 short length_total; // not used 180 unsigned char n_frames; 181 unsigned char sqflags; 182 frame_t frame[N_SEQ_FRAMES]; // max. frames in a spectrum sequence 183 } SPECT_SEQK; // sequence of klatt formants frames 184 185 typedef struct { 186 short length; 187 short frflags; 188 frame_t *frame; 189 } frameref_t; 190 191 // a clause translated into phoneme codes (first stage) 192 typedef struct { 193 unsigned short synthflags; // NOTE Put shorts on 32bit boundaries, because of RISC OS compiler bug? 194 unsigned char phcode; 195 unsigned char stresslevel; 196 unsigned short sourceix; // ix into the original source text string, only set at the start of a word 197 unsigned char wordstress; // the highest level stress in this word 198 unsigned char tone_ph; // tone phoneme to use with this vowel 199 } PHONEME_LIST2; 200 201 typedef struct { 202 // The first section is a copy of PHONEME_LIST2 203 unsigned short synthflags; 204 unsigned char phcode; 205 unsigned char stresslevel; 206 unsigned short sourceix; // ix into the original source text string, only set at the start of a word 207 unsigned char wordstress; // the highest level stress in this word 208 unsigned char tone_ph; // tone phoneme to use with this vowel 209 210 PHONEME_TAB *ph; 211 unsigned int length; // length_mod 212 unsigned char env; // pitch envelope number 213 unsigned char type; 214 unsigned char prepause; 215 unsigned char postpause; 216 unsigned char amp; 217 unsigned char newword; // bit 0=start of word, bit 1=end of clause, bit 2=start of sentence 218 unsigned char pitch1; 219 unsigned char pitch2; 220 unsigned char std_length; 221 unsigned int phontab_addr; 222 int sound_param; 223 } PHONEME_LIST; 224 225 #define pd_FMT 0 226 #define pd_WAV 1 227 #define pd_VWLSTART 2 228 #define pd_VWLEND 3 229 #define pd_ADDWAV 4 230 231 #define N_PHONEME_DATA_PARAM 16 232 #define pd_INSERTPHONEME i_INSERT_PHONEME 233 #define pd_APPENDPHONEME i_APPEND_PHONEME 234 #define pd_CHANGEPHONEME i_CHANGE_PHONEME 235 #define pd_CHANGE_NEXTPHONEME i_REPLACE_NEXT_PHONEME 236 #define pd_LENGTHMOD i_SET_LENGTH 237 238 #define pd_FORNEXTPH 0x2 239 #define pd_DONTLENGTHEN 0x4 240 #define pd_REDUCELENGTHCHANGE 0x8 241 typedef struct { 242 int pd_control; 243 int pd_param[N_PHONEME_DATA_PARAM]; // set from group 0 instructions 244 int sound_addr[5]; 245 int sound_param[5]; 246 int vowel_transition[4]; 247 int pitch_env; 248 int amp_env; 249 char ipa_string[18]; 250 } PHONEME_DATA; 251 252 typedef struct { 253 int fmt_control; 254 int use_vowelin; 255 int fmt_addr; 256 int fmt_length; 257 int fmt_amp; 258 int fmt2_addr; 259 int fmt2_lenadj; 260 int wav_addr; 261 int wav_amp; 262 int transition0; 263 int transition1; 264 int std_length; 265 } FMT_PARAMS; 266 267 typedef struct { 268 PHONEME_LIST prev_vowel; 269 } WORD_PH_DATA; 270 271 // instructions 272 273 #define INSTN_RETURN 0x0001 274 #define INSTN_CONTINUE 0x0002 275 276 // Group 0 instrcutions with 8 bit operand. These values go into bits 8-15 of the instruction 277 #define i_CHANGE_PHONEME 0x01 278 #define i_REPLACE_NEXT_PHONEME 0x02 279 #define i_INSERT_PHONEME 0x03 280 #define i_APPEND_PHONEME 0x04 281 #define i_APPEND_IFNEXTVOWEL 0x05 282 #define i_VOICING_SWITCH 0x06 283 #define i_PAUSE_BEFORE 0x07 284 #define i_PAUSE_AFTER 0x08 285 #define i_LENGTH_MOD 0x09 286 #define i_SET_LENGTH 0x0a 287 #define i_LONG_LENGTH 0x0b 288 #define i_ADD_LENGTH 0x0c 289 #define i_IPA_NAME 0x0d 290 291 #define i_CHANGE_IF 0x10 // 0x10 to 0x14 292 293 // conditions and jumps 294 #define i_CONDITION 0x2000 295 #define i_OR 0x1000 // added to i_CONDITION 296 #define i_NOT 0x0003 297 298 #define i_JUMP 0x6000 299 #define i_JUMP_FALSE 0x6800 300 #define i_SWITCH_NEXTVOWEL 0x6a00 301 #define i_SWITCH_PREVVOWEL 0x6c00 302 #define MAX_JUMP 255 // max jump distance 303 304 // multi-word instructions 305 #define i_CALLPH 0x9100 306 #define i_PITCHENV 0x9200 307 #define i_AMPENV 0x9300 308 #define i_VOWELIN 0xa100 309 #define i_VOWELOUT 0xa200 310 #define i_FMT 0xb000 311 #define i_WAV 0xc000 312 #define i_VWLSTART 0xd000 313 #define i_VWLENDING 0xe000 314 #define i_WAVADD 0xf000 315 316 // conditions 317 #define CONDITION_IS_PHONEME_TYPE 0x00 318 #define CONDITION_IS_PLACE_OF_ARTICULATION 0x20 319 #define CONDITION_IS_PHFLAG_SET 0x40 320 #define CONDITION_IS_OTHER 0x80 321 322 // other conditions (stress) 323 #define STRESS_IS_DIMINISHED 0 // diminished, unstressed within a word 324 #define STRESS_IS_UNSTRESSED 1 // unstressed, weak 325 #define STRESS_IS_NOT_STRESSED 2 // default, not stressed 326 #define STRESS_IS_SECONDARY 3 // secondary stress 327 #define STRESS_IS_PRIMARY 4 // primary (main) stress 328 #define STRESS_IS_PRIORITY 5 // replaces primary markers 329 #define STRESS_IS_EMPHASIZED 6 // emphasized 330 331 // other conditions 332 #define isAfterStress 9 333 #define isNotVowel 10 334 #define isFinalVowel 11 335 #define isVoiced 12 // voiced consonant, or vowel 336 #define isFirstVowel 13 337 #define isSecondVowel 14 338 #define isTranslationGiven 16 // phoneme translation given in **_list or as [[...]] 339 #define isBreak 17 // pause phoneme or (stop/vstop/vfric not followed by vowel or (liquid in same word)) 340 #define isWordStart 18 341 #define isWordEnd 19 342 343 #define i_StressLevel 0x800 344 345 typedef struct { 346 int name; 347 int length; 348 char *data; 349 char *filename; 350 } SOUND_ICON; 351 352 typedef struct { 353 int name; 354 unsigned int next_phoneme; 355 int mbr_name; 356 int mbr_name2; 357 int percent; // percentage length of first component 358 int control; 359 } MBROLA_TAB; 360 361 typedef struct { 362 int pause_factor; 363 int clause_pause_factor; 364 unsigned int min_pause; 365 int wav_factor; 366 int lenmod_factor; 367 int lenmod2_factor; 368 int min_sample_len; 369 int loud_consonants; 370 int fast_settings[8]; 371 } SPEED_FACTORS; 372 373 typedef struct { 374 char name[12]; 375 unsigned char flags[4]; 376 signed char head_extend[8]; 377 378 unsigned char prehead_start; 379 unsigned char prehead_end; 380 unsigned char stressed_env; 381 unsigned char stressed_drop; 382 unsigned char secondary_drop; 383 unsigned char unstressed_shape; 384 385 unsigned char onset; 386 unsigned char head_start; 387 unsigned char head_end; 388 unsigned char head_last; 389 390 unsigned char head_max_steps; 391 unsigned char n_head_extend; 392 393 signed char unstr_start[3]; // for: onset, head, last 394 signed char unstr_end[3]; 395 396 unsigned char nucleus0_env; // pitch envelope, tonic syllable is at end, no tail 397 unsigned char nucleus0_max; 398 unsigned char nucleus0_min; 399 400 unsigned char nucleus1_env; // when followed by a tail 401 unsigned char nucleus1_max; 402 unsigned char nucleus1_min; 403 unsigned char tail_start; 404 unsigned char tail_end; 405 406 unsigned char split_nucleus_env; 407 unsigned char split_nucleus_max; 408 unsigned char split_nucleus_min; 409 unsigned char split_tail_start; 410 unsigned char split_tail_end; 411 unsigned char split_tune; 412 413 unsigned char spare[8]; 414 int spare2; // the struct length should be a multiple of 4 bytes 415 } TUNE; 416 417 extern int n_tunes; 418 extern TUNE *tunes; 419 420 // phoneme table 421 extern PHONEME_TAB *phoneme_tab[N_PHONEME_TAB]; 422 423 // list of phonemes in a clause 424 extern int n_phoneme_list; 425 extern PHONEME_LIST phoneme_list[N_PHONEME_LIST+1]; 426 extern unsigned int embedded_list[]; 427 428 extern unsigned char env_fall[128]; 429 extern unsigned char env_rise[128]; 430 extern unsigned char env_frise[128]; 431 432 #define MAX_PITCH_VALUE 101 433 extern unsigned char pitch_adjust_tab[MAX_PITCH_VALUE+1]; 434 435 // queue of commands for wavegen 436 #define WCMD_KLATT 1 437 #define WCMD_KLATT2 2 438 #define WCMD_SPECT 3 439 #define WCMD_SPECT2 4 440 #define WCMD_PAUSE 5 441 #define WCMD_WAVE 6 442 #define WCMD_WAVE2 7 443 #define WCMD_AMPLITUDE 8 444 #define WCMD_PITCH 9 445 #define WCMD_MARKER 10 446 #define WCMD_VOICE 11 447 #define WCMD_EMBEDDED 12 448 #define WCMD_MBROLA_DATA 13 449 #define WCMD_FMT_AMPLITUDE 14 450 #define WCMD_SONIC_SPEED 15 451 452 #define N_WCMDQ 170 453 #define MIN_WCMDQ 25 // need this many free entries before adding new phoneme 454 455 extern intptr_t wcmdq[N_WCMDQ][4]; 456 extern int wcmdq_head; 457 extern int wcmdq_tail; 458 459 // from Wavegen file 460 int WcmdqFree(void); 461 void WcmdqStop(void); 462 int WcmdqUsed(void); 463 void WcmdqInc(void); 464 void WavegenInit(int rate, int wavemult_fact); 465 float polint(float xa[], float ya[], int n, float x); 466 int WavegenFill(void); 467 void MarkerEvent(int type, unsigned int char_position, int value, int value2, unsigned char *out_ptr); 468 int GetAmplitude(void); 469 void SetPitch2(voice_t *voice, int pitch1, int pitch2, int *pitch_base, int *pitch_range); 470 int PeaksToHarmspect(wavegen_peaks_t *peaks, int pitch, int *htab, int control); 471 472 extern unsigned char *wavefile_data; 473 extern int samplerate; 474 extern int samplerate_native; 475 476 extern int wavefile_ix; 477 extern int wavefile_amp; 478 extern int wavefile_ix2; 479 extern int wavefile_amp2; 480 extern int vowel_transition[4]; 481 extern int vowel_transition0, vowel_transition1; 482 483 #define N_ECHO_BUF 5500 // max of 250mS at 22050 Hz 484 extern int echo_head; 485 extern int echo_tail; 486 extern int echo_amp; 487 extern short echo_buf[N_ECHO_BUF]; 488 489 extern int mbrola_delay; 490 extern char mbrola_name[20]; 491 492 // from synthdata file 493 unsigned int LookupSound(PHONEME_TAB *ph1, PHONEME_TAB *ph2, int which, int *match_level, int control); 494 frameref_t *LookupSpect(PHONEME_TAB *this_ph, int which, FMT_PARAMS *fmt_params, int *n_frames, PHONEME_LIST *plist); 495 void FreePhData(void); 496 497 unsigned char *LookupEnvelope(int ix); 498 espeak_ng_STATUS LoadPhData(int *srate, espeak_ng_ERROR_CONTEXT *context); 499 void FreePhData(void); 500 501 void SynthesizeInit(void); 502 int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume); 503 void MakeWave2(PHONEME_LIST *p, int n_ph); 504 int SpeakNextClause(int control); 505 void SetSpeed(int control); 506 void SetEmbedded(int control, int value); 507 void SelectPhonemeTable(int number); 508 int SelectPhonemeTableName(const char *name); 509 510 void Write4Bytes(FILE *f, int value); 511 int Read4Bytes(FILE *f); 512 513 #define ENV_LEN 128 // length of pitch envelopes 514 #define PITCHfall 0 // standard pitch envelopes 515 #define PITCHrise 2 516 #define N_ENVELOPE_DATA 20 517 extern unsigned char *envelope_data[N_ENVELOPE_DATA]; 518 519 extern int formant_rate[]; // max rate of change of each formant 520 extern SPEED_FACTORS speed; 521 522 extern long count_samples; 523 extern unsigned char *out_ptr; 524 extern unsigned char *out_start; 525 extern unsigned char *out_end; 526 extern int event_list_ix; 527 extern espeak_EVENT *event_list; 528 extern t_espeak_callback *synth_callback; 529 extern const char *version_string; 530 extern const int version_phdata; 531 extern double sonicSpeed; 532 533 #define N_SOUNDICON_TAB 80 // total entries in soundicon_tab 534 #define N_SOUNDICON_SLOTS 4 // number of slots reserved for dynamic loading of audio files 535 extern int n_soundicon_tab; 536 extern SOUND_ICON soundicon_tab[N_SOUNDICON_TAB]; 537 538 espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate); 539 espeak_ng_STATUS SetParameter(int parameter, int value, int relative); 540 int MbrolaTranslate(PHONEME_LIST *plist, int n_phonemes, bool resume, FILE *f_mbrola); 541 int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume); 542 int MbrolaFill(int length, bool resume, int amplitude); 543 void MbrolaReset(void); 544 void DoEmbedded(int *embix, int sourceix); 545 void DoMarker(int type, int char_posn, int length, int value); 546 void DoPhonemeMarker(int type, int char_posn, int length, char *name); 547 int DoSample3(PHONEME_DATA *phdata, int length_mod, int amp); 548 int DoSpect2(PHONEME_TAB *this_ph, int which, FMT_PARAMS *fmt_params, PHONEME_LIST *plist, int modulation); 549 int FormantTransition2(frameref_t *seq, int *n_frames, unsigned int data1, unsigned int data2, PHONEME_TAB *other_ph, int which); 550 int PauseLength(int pause, int control); 551 int LookupPhonemeTable(const char *name); 552 unsigned char *GetEnvelope(int index); 553 int NumInstnWords(USHORT *prog); 554 int GetAmplitude(void); 555 556 void InitBreath(void); 557 558 559 560 #ifdef __cplusplus 561 } 562 #endif 563