1 /*
2  * Copyright (C) 2005 to 2014 by Jonathan Duddington
3  * email: jonsd@users.sourceforge.net
4  * Copyright (C) 2015-2017 Reece H. Dunn
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see: <http://www.gnu.org/licenses/>.
18  */
19 
20 //#include <stdbool.h>
21 
22 #include "phoneme.h"
23 #include "voice.h"
24 #ifdef __cplusplus
25 extern "C"
26 {
27 #endif
28 
29 #define espeakINITIALIZE_PHONEME_IPA 0x0002 // move this to speak_lib.h, after eSpeak version 1.46.02
30 
31 #define N_PHONEME_LIST 1000 // enough for source[N_TR_SOURCE] full of text, else it will truncate; phoneme_list[] is declared with N_PHONEME_LIST+1 entries
32 
33 #define MAX_HARMONIC 400 // 400 * 50Hz = 20 kHz, more than enough
34 #define N_SEQ_FRAMES  25 // max frames in a spectrum sequence (real max is about 8); array size of SPECT_SEQ.frame[] and SPECT_SEQK.frame[]
35 #define STEPSIZE      64 // samples per step: 2.9 ms at 22 kHz sample rate
36 
37 // flags set for frames within a spectrum sequence (bit values for the frflags fields declared below)
38 #define FRFLAG_KLATT           0x01 // this frame includes extra data for Klatt synthesizer
39 #define FRFLAG_VOWEL_CENTRE    0x02 // centre point of vowel
40 #define FRFLAG_LEN_MOD         0x04 // reduce effect of length adjustment
41 #define FRFLAG_BREAK_LF        0x08 // but keep f3 upwards
42 #define FRFLAG_BREAK           0x10 // don't merge with next frame
43 #define FRFLAG_BREAK_2         0x18 // mask of both break bits: FRFLAG_BREAK_LF | FRFLAG_BREAK
44 #define FRFLAG_FORMANT_RATE    0x20 // Flag5 allow increased rate of change of formant freq
45 #define FRFLAG_MODULATE        0x40 // Flag6 modulate amplitude of some cycles to give trill
46 #define FRFLAG_DEFER_WAV       0x80 // Flag7 defer mixing WAV until the next frame
47 #define FRFLAG_LEN_MOD2      0x4000 // reduce effect of length adjustment, used for the start of a vowel
48 #define FRFLAG_COPIED        0x8000 // This frame has been copied into temporary rw memory
49 
// Per-phoneme flag bits.
// NOTE(review): presumably stored in the synthflags field of PHONEME_LIST / PHONEME_LIST2 - confirm against the users of these macros.
50 #define SFLAG_SEQCONTINUE      0x01 // a liquid or nasal after a vowel, but not followed by a vowel
51 #define SFLAG_EMBEDDED         0x02 // there are embedded commands before this phoneme
52 #define SFLAG_SYLLABLE         0x04 // vowel or syllabic consonant
53 #define SFLAG_LENGTHEN         0x08 // lengthen symbol : included after this phoneme
54 #define SFLAG_DICTIONARY       0x10 // the pronunciation of this word was listed in the xx_list dictionary
55 #define SFLAG_SWITCHED_LANG    0x20 // this word uses phonemes from a different language
56 #define SFLAG_PROMOTE_STRESS   0x40 // this unstressed word can be promoted to stressed
57 
58 #define SFLAG_PREV_PAUSE     0x1000 // consider previous phoneme as pause
59 #define SFLAG_NEXT_PAUSE     0x2000 // consider next phoneme as pause
60 
61 // embedded command numbers (the highest, EMBED_C, is N_EMBEDDED_VALUES - 1)
62 #define EMBED_P     1 // pitch
63 #define EMBED_S     2 // speed (used in setlengths)
64 #define EMBED_A     3 // amplitude/volume
65 #define EMBED_R     4 // pitch range/expression
66 #define EMBED_H     5 // echo/reverberation
67 #define EMBED_T     6 // different tone for announcing punctuation (not used)
68 #define EMBED_I     7 // sound icon
69 #define EMBED_S2    8 // speed (used in synthesize)
70 #define EMBED_Y     9 // say-as commands
71 #define EMBED_M    10 // mark name
72 #define EMBED_U    11 // audio uri
73 #define EMBED_B    12 // break
74 #define EMBED_F    13 // emphasis
75 #define EMBED_C    14 // capital letter indication
76 
77 #define N_EMBEDDED_VALUES    15 // number of entries in embedded_value[] and embedded_default[]
78 extern int embedded_value[N_EMBEDDED_VALUES];
79 extern int embedded_default[N_EMBEDDED_VALUES];
80 
81 #define N_MARKERS 8
82 
83 #define N_KLATTP   10 // this affects the phoneme data file format; equals the number of KLATT_* indices below (0-9)
84 #define N_KLATTP2  14 // used in vowel files, with extra parameters for future extensions
85 
// Klatt parameter indices. 0-4 match the order documented for frame_t.klattp[] (AV, FNZ, Tilt, Aspr, Skew).
86 #define KLATT_AV      0
87 #define KLATT_FNZ     1 // nasal zero freq
88 #define KLATT_Tilt    2
89 #define KLATT_Aspr    3
90 #define KLATT_Skew    4
91 
// 5-9 continue the numbering; frame_t.klattp2[] is described as the continuation of klattp[].
92 #define KLATT_Kopen   5
93 #define KLATT_AVp     6
94 #define KLATT_Fric    7
95 #define KLATT_FricBP  8
96 #define KLATT_Turb    9
97 
// Spectrum frame including the extra Klatt parameters for parallel resonators.
// The members from frflags through klattp[] are declared in the same order and with
// the same types as in frame_t2; do not reorder them.
98 typedef struct { // 64 bytes
99 	short frflags;             // FRFLAG_* bits
100 	short ffreq[7];
101 	unsigned char length;
102 	unsigned char rms;
103 	unsigned char fheight[8];
104 	unsigned char fwidth[6];   // width/4  f0-5
105 	unsigned char fright[3];   // width/4  f0-2
106 	unsigned char bw[4];       // Klatt bandwidth BNZ /2, f1,f2,f3
107 	unsigned char klattp[5];   // AV, FNZ, Tilt, Aspr, Skew
108 	unsigned char klattp2[5];  // continuation of klattp[]: Kopen, AVp, Fric, FricBP, Turb (KLATT_Kopen..KLATT_Turb)
109 	unsigned char klatt_ap[7]; // Klatt parallel amplitude
110 	unsigned char klatt_bp[7]; // Klatt parallel bandwidth  /2
111 	unsigned char spare;       // pad to multiple of 4 bytes
112 } frame_t; // with extra Klatt parameters for parallel resonators
113 
// Spectrum frame without the extra Klatt parameters; its members are the leading
// members of frame_t, in the same order. Used as the element type of SPECT_SEQ.frame[].
114 typedef struct { // 44 bytes
115 	short frflags;            // FRFLAG_* bits
116 	short ffreq[7];
117 	unsigned char length;
118 	unsigned char rms;
119 	unsigned char fheight[8];
120 	unsigned char fwidth[6];  // width/4  f0-5
121 	unsigned char fright[3];  // width/4  f0-2
122 	unsigned char bw[4];      // Klatt bandwidth BNZ /2, f1,f2,f3
123 	unsigned char klattp[5];  // AV, FNZ, Tilt, Aspr, Skew
124 } frame_t2; // without the extra Klatt parameters
125 
126 // formant data used by wavegen: fixed-point values, floating point copies of them, and per-step increments
// (DOUBLEX is not defined in this header; it must come from an included header.)
127 typedef struct {
128 	int freq;     // Hz<<16
129 	int height;   // height<<15
130 	int left;     // Hz<<16
131 	int right;    // Hz<<16
132 	DOUBLEX freq1; // floating point versions of the above
133 	DOUBLEX height1;
134 	DOUBLEX left1;
135 	DOUBLEX right1;
136 	DOUBLEX freq_inc; // increment by this every 64 samples
137 	DOUBLEX height_inc;
138 	DOUBLEX left_inc;
139 	DOUBLEX right_inc;
140 } wavegen_peaks_t;
141 
// Wave generator data in three groups: pitch envelope state, the wave file
// being mixed into the synthesis, and amplitude settings.
142 typedef struct {
143 	unsigned char *pitch_env;
144 	int pitch;      // pitch Hz*256
145 	int pitch_ix;   // index into pitch envelope (*256)
146 	int pitch_inc;  // increment to pitch_ix
147 	int pitch_base; // Hz*256 low, before modified by envelope
148 	int pitch_range; // Hz*256 range of envelope
149 
150 	unsigned char *mix_wavefile; // wave file to be added to synthesis
151 	int n_mix_wavefile; // length in bytes
152 	int mix_wave_scale; // 0=2 byte samples
153 	int mix_wave_amp;
154 	int mix_wavefile_ix;
155 	int mix_wavefile_max; // length of available WAV data (in bytes)
156 	int mix_wavefile_offset;
157 
158 	int amplitude;
159 	int amplitude_v;
160 	int amplitude_fmt; // percentage amplitude adjustment for formant synthesis
161 } WGEN_DATA;
162 
// State of one resonator.
// NOTE(review): presumably a, b, c are the filter coefficients and x1, x2 the two
// previous sample values - confirm against the code that uses RESONATOR.
163 typedef struct {
164 	double a;
165 	double b;
166 	double c;
167 	double x1;
168 	double x2;
169 } RESONATOR;
170 
// Header plus up to N_SEQ_FRAMES frames of type frame_t2 (no extra Klatt data).
// Same header members as SPECT_SEQK; only the frame element type differs.
171 typedef struct {
172 	short length_total; // not used
173 	unsigned char n_frames;
174 	unsigned char sqflags;
175 	frame_t2 frame[N_SEQ_FRAMES]; // max. frames in a spectrum sequence
176 } SPECT_SEQ; // sequence of espeak formant frames
177 
// Header plus up to N_SEQ_FRAMES frames of type frame_t (with the extra Klatt data).
// Same header members as SPECT_SEQ; only the frame element type differs.
178 typedef struct {
179 	short length_total; // not used
180 	unsigned char n_frames;
181 	unsigned char sqflags;
182 	frame_t frame[N_SEQ_FRAMES]; // max. frames in a spectrum sequence
183 } SPECT_SEQK; // sequence of klatt formants frames
184 
// Reference to a frame_t together with its own length and flags values.
// LookupSpect() returns a pointer to frameref_t and reports a count through *n_frames.
185 typedef struct {
186 	short length;
187 	short frflags; // NOTE(review): presumably FRFLAG_* bits, like frame_t.frflags - confirm
188 	frame_t *frame;
189 } frameref_t;
190 
191 // a clause translated into phoneme codes (first stage)
// PHONEME_LIST begins with the same members in the same order; keep the two in step.
192 typedef struct {
193 	unsigned short synthflags; // NOTE Put shorts on 32bit boundaries, because of RISC OS compiler bug?
194 	unsigned char phcode;
195 	unsigned char stresslevel;
196 	unsigned short sourceix;  // ix into the original source text string, only set at the start of a word
197 	unsigned char wordstress; // the highest level stress in this word
198 	unsigned char tone_ph;    // tone phoneme to use with this vowel
199 } PHONEME_LIST2;
200 
// One entry of the phoneme list for a clause (see the phoneme_list[] declaration in this header).
// Starts with the PHONEME_LIST2 members, followed by additional per-phoneme data.
201 typedef struct {
202 	// The first section is a copy of PHONEME_LIST2 (same members, same order)
203 	unsigned short synthflags;
204 	unsigned char phcode;
205 	unsigned char stresslevel;
206 	unsigned short sourceix;  // ix into the original source text string, only set at the start of a word
207 	unsigned char wordstress; // the highest level stress in this word
208 	unsigned char tone_ph;    // tone phoneme to use with this vowel
209 
210 	PHONEME_TAB *ph;      // PHONEME_TAB is not defined in this header
211 	unsigned int length;  // length_mod
212 	unsigned char env;    // pitch envelope number
213 	unsigned char type;
214 	unsigned char prepause;
215 	unsigned char postpause;
216 	unsigned char amp;
217 	unsigned char newword;   // bit 0=start of word, bit 1=end of clause, bit 2=start of sentence
218 	unsigned char pitch1;
219 	unsigned char pitch2;
220 	unsigned char std_length;
221 	unsigned int phontab_addr;
222 	int sound_param;
223 } PHONEME_LIST;
224 
// NOTE(review): pd_FMT..pd_ADDWAV take the values 0-4, which matches the 5 entries of
// PHONEME_DATA.sound_addr[] / sound_param[]; presumably they index those arrays - confirm.
225 #define pd_FMT    0
226 #define pd_WAV    1
227 #define pd_VWLSTART 2
228 #define pd_VWLEND 3
229 #define pd_ADDWAV 4
230 
231 #define N_PHONEME_DATA_PARAM 16 // number of entries in PHONEME_DATA.pd_param[]
// The following are aliases for group 0 instruction codes (the i_* macros defined later in this header).
232 #define pd_INSERTPHONEME   i_INSERT_PHONEME
233 #define pd_APPENDPHONEME   i_APPEND_PHONEME
234 #define pd_CHANGEPHONEME   i_CHANGE_PHONEME
235 #define pd_CHANGE_NEXTPHONEME  i_REPLACE_NEXT_PHONEME
236 #define pd_LENGTHMOD       i_SET_LENGTH // NOTE(review): aliases i_SET_LENGTH, not i_LENGTH_MOD - confirm this is intended
237 
238 #define pd_FORNEXTPH     0x2
239 #define pd_DONTLENGTHEN  0x4
240 #define pd_REDUCELENGTHCHANGE 0x8
// Data collected for one phoneme: control value, parameters set by group 0
// instructions, sound addresses and parameters, vowel transition values,
// pitch/amplitude envelope numbers and an IPA string.
241 typedef struct {
242 	int pd_control;
243 	int pd_param[N_PHONEME_DATA_PARAM];  // set from group 0 instructions
244 	int sound_addr[5];
245 	int sound_param[5];
246 	int vowel_transition[4];
247 	int pitch_env;
248 	int amp_env;
249 	char ipa_string[18];
250 } PHONEME_DATA;
251 
// Parameter block passed by pointer to LookupSpect() and DoSpect2().
// All members are plain ints; their exact meaning is defined by the code that fills them in.
252 typedef struct {
253 	int fmt_control;
254 	int use_vowelin;
255 	int fmt_addr;
256 	int fmt_length;
257 	int fmt_amp;
258 	int fmt2_addr;
259 	int fmt2_lenadj;
260 	int wav_addr;
261 	int wav_amp;
262 	int transition0;
263 	int transition1;
264 	int std_length;
265 } FMT_PARAMS;
266 
// Wrapper holding a full PHONEME_LIST entry (by value) named prev_vowel.
267 typedef struct {
268 	PHONEME_LIST prev_vowel;
269 } WORD_PH_DATA;
270 
271 // instructions
272 
273 #define INSTN_RETURN         0x0001
274 #define INSTN_CONTINUE       0x0002
275 
276 // Group 0 instructions with 8 bit operand.  These values go into bits 8-15 of the instruction
277 #define i_CHANGE_PHONEME 0x01
278 #define i_REPLACE_NEXT_PHONEME 0x02
279 #define i_INSERT_PHONEME 0x03
280 #define i_APPEND_PHONEME 0x04
281 #define i_APPEND_IFNEXTVOWEL 0x05
282 #define i_VOICING_SWITCH 0x06
283 #define i_PAUSE_BEFORE   0x07
284 #define i_PAUSE_AFTER    0x08
285 #define i_LENGTH_MOD     0x09
286 #define i_SET_LENGTH     0x0a
287 #define i_LONG_LENGTH    0x0b
288 #define i_ADD_LENGTH     0x0c
289 #define i_IPA_NAME       0x0d
290 
291 #define i_CHANGE_IF      0x10  // 0x10 to 0x14
292 
293 // conditions and jumps
294 #define i_CONDITION  0x2000
295 #define i_OR         0x1000  // added to i_CONDITION
296 #define i_NOT        0x0003
297 
298 #define i_JUMP       0x6000
299 #define i_JUMP_FALSE 0x6800
300 #define i_SWITCH_NEXTVOWEL 0x6a00
301 #define i_SWITCH_PREVVOWEL 0x6c00
302 #define MAX_JUMP     255  // max jump distance
303 
304 // multi-word instructions
305 #define i_CALLPH     0x9100
306 #define i_PITCHENV   0x9200
307 #define i_AMPENV     0x9300
308 #define i_VOWELIN    0xa100
309 #define i_VOWELOUT   0xa200
310 #define i_FMT        0xb000
311 #define i_WAV        0xc000
312 #define i_VWLSTART   0xd000
313 #define i_VWLENDING  0xe000
314 #define i_WAVADD     0xf000
315 
316 // conditions (condition type codes, in steps of 0x20 up to 0x80)
317 #define CONDITION_IS_PHONEME_TYPE 0x00
318 #define CONDITION_IS_PLACE_OF_ARTICULATION 0x20
319 #define CONDITION_IS_PHFLAG_SET 0x40
320 #define CONDITION_IS_OTHER 0x80
321 
322 // other conditions (stress), values 0-6 in increasing order of stress
323 #define STRESS_IS_DIMINISHED    0       // diminished, unstressed within a word
324 #define STRESS_IS_UNSTRESSED    1       // unstressed, weak
325 #define STRESS_IS_NOT_STRESSED  2       // default, not stressed
326 #define STRESS_IS_SECONDARY     3       // secondary stress
327 #define STRESS_IS_PRIMARY       4       // primary (main) stress
328 #define STRESS_IS_PRIORITY      5       // replaces primary markers
329 #define STRESS_IS_EMPHASIZED	6       // emphasized
330 
331 // other conditions (values 9-19; 15 is not assigned in this header)
332 #define isAfterStress  9
333 #define isNotVowel    10
334 #define isFinalVowel  11
335 #define isVoiced      12 // voiced consonant, or vowel
336 #define isFirstVowel  13
337 #define isSecondVowel 14
338 #define isTranslationGiven 16 // phoneme translation given in **_list or as [[...]]
339 #define isBreak        17 // pause phoneme or (stop/vstop/vfric not followed by vowel or (liquid in same word))
340 #define isWordStart    18
341 #define isWordEnd      19
342 
343 #define i_StressLevel  0x800
344 
// One entry of soundicon_tab[]: a name, a length, and pointers to data and a file name.
// NOTE(review): this header does not show who owns (allocates/frees) data and filename - confirm at the use sites.
345 typedef struct {
346 	int name;
347 	int length;
348 	char *data;
349 	char *filename;
350 } SOUND_ICON;
351 
// One entry of an MBROLA phoneme table: a name, a next_phoneme value,
// up to two MBROLA names, and a split percentage plus control value.
352 typedef struct {
353 	int name;
354 	unsigned int next_phoneme;
355 	int mbr_name;
356 	int mbr_name2;
357 	int percent; // percentage length of first component
358 	int control;
359 } MBROLA_TAB;
360 
// Speed-related factors and limits; a single instance is declared in this header as `extern SPEED_FACTORS speed`.
361 typedef struct {
362 	int pause_factor;
363 	int clause_pause_factor;
364 	unsigned int min_pause;
365 	int wav_factor;
366 	int lenmod_factor;
367 	int lenmod2_factor;
368 	int min_sample_len;
369 	int loud_consonants;
370 	int fast_settings[8];
371 } SPEED_FACTORS;
372 
// Description of one tune; the tunes are reached through the `tunes` pointer
// and counted by `n_tunes` (both declared in this header).
// All members except spare2 are single bytes or byte arrays; spare[] and spare2 are unused padding.
373 typedef struct {
374 	char name[12];
375 	unsigned char flags[4];
376 	signed char head_extend[8];
377 
378 	unsigned char prehead_start;
379 	unsigned char prehead_end;
380 	unsigned char stressed_env;
381 	unsigned char stressed_drop;
382 	unsigned char secondary_drop;
383 	unsigned char unstressed_shape;
384 
385 	unsigned char onset;
386 	unsigned char head_start;
387 	unsigned char head_end;
388 	unsigned char head_last;
389 
390 	unsigned char head_max_steps;
391 	unsigned char n_head_extend; // NOTE(review): presumably the number of head_extend[] entries in use - confirm
392 
393 	signed char unstr_start[3]; // for: onset, head, last
394 	signed char unstr_end[3];   // same three positions as unstr_start[]
395 
396 	unsigned char nucleus0_env; // pitch envelope, tonic syllable is at end, no tail
397 	unsigned char nucleus0_max;
398 	unsigned char nucleus0_min;
399 
400 	unsigned char nucleus1_env; // when followed by a tail
401 	unsigned char nucleus1_max;
402 	unsigned char nucleus1_min;
403 	unsigned char tail_start;
404 	unsigned char tail_end;
405 
406 	unsigned char split_nucleus_env;
407 	unsigned char split_nucleus_max;
408 	unsigned char split_nucleus_min;
409 	unsigned char split_tail_start;
410 	unsigned char split_tail_end;
411 	unsigned char split_tune;
412 
413 	unsigned char spare[8];
414 	int spare2; // the struct length should be a multiple of 4 bytes
415 } TUNE;
416 
// Tune table: pointer to TUNE entries and their count (defined in another translation unit).
417 extern int n_tunes;
418 extern TUNE *tunes;
419 
420 // phoneme table (N_PHONEME_TAB and PHONEME_TAB are not defined in this header)
421 extern PHONEME_TAB *phoneme_tab[N_PHONEME_TAB];
422 
423 // list of phonemes in a clause
424 extern int n_phoneme_list;
425 extern PHONEME_LIST phoneme_list[N_PHONEME_LIST+1]; // one entry more than N_PHONEME_LIST
426 extern unsigned int embedded_list[];
427 
// Built-in envelopes; 128 entries each, the same value as ENV_LEN defined later in this header.
428 extern unsigned char env_fall[128];
429 extern unsigned char env_rise[128];
430 extern unsigned char env_frise[128];
431 
432 #define MAX_PITCH_VALUE  101
433 extern unsigned char pitch_adjust_tab[MAX_PITCH_VALUE+1]; // valid indices 0..MAX_PITCH_VALUE
434 
435 // queue of commands for wavegen (command codes 1-15)
436 #define WCMD_KLATT  1
437 #define WCMD_KLATT2 2
438 #define WCMD_SPECT  3
439 #define WCMD_SPECT2 4
440 #define WCMD_PAUSE  5
441 #define WCMD_WAVE    6
442 #define WCMD_WAVE2   7
443 #define WCMD_AMPLITUDE 8
444 #define WCMD_PITCH  9
445 #define WCMD_MARKER 10
446 #define WCMD_VOICE   11
447 #define WCMD_EMBEDDED 12
448 #define WCMD_MBROLA_DATA 13
449 #define WCMD_FMT_AMPLITUDE 14
450 #define WCMD_SONIC_SPEED 15
451 
452 #define N_WCMDQ   170   // number of entries in wcmdq[]
453 #define MIN_WCMDQ  25   // need this many free entries before adding new phoneme
454 
// The queue itself: N_WCMDQ entries of 4 intptr_t values each, with head and tail indices.
// NOTE(review): intptr_t requires <stdint.h>, which this header does not include directly.
455 extern intptr_t wcmdq[N_WCMDQ][4];
456 extern int wcmdq_head;
457 extern int wcmdq_tail;
458 
459 // from Wavegen file
460 int  WcmdqFree(void);
461 void WcmdqStop(void);
462 int  WcmdqUsed(void);
463 void WcmdqInc(void);
464 void WavegenInit(int rate, int wavemult_fact);
465 float polint(float xa[], float ya[], int n, float x);
466 int WavegenFill(void);
467 void MarkerEvent(int type, unsigned int char_position, int value, int value2, unsigned char *out_ptr);
468 int GetAmplitude(void); // also declared a second time near the end of this header
469 void SetPitch2(voice_t *voice, int pitch1, int pitch2, int *pitch_base, int *pitch_range); // pitch_base and pitch_range are pointers to int, not const
470 int PeaksToHarmspect(wavegen_peaks_t *peaks, int pitch, int *htab, int control);
471 
472 extern unsigned char *wavefile_data;
473 extern int samplerate;
474 extern int samplerate_native;
475 
476 extern int wavefile_ix;
477 extern int wavefile_amp;
478 extern int wavefile_ix2;
479 extern int wavefile_amp2;
480 extern int vowel_transition[4];
481 extern int vowel_transition0, vowel_transition1;
482 
483 #define N_ECHO_BUF 5500   // max of 250 ms at 22050 Hz; number of entries in echo_buf[]
484 extern int echo_head;
485 extern int echo_tail;
486 extern int echo_amp;
487 extern short echo_buf[N_ECHO_BUF];
488 
489 extern int mbrola_delay;
490 extern char mbrola_name[20];
490 extern char mbrola_name[20];
491 
492 // from synthdata file
493 unsigned int LookupSound(PHONEME_TAB *ph1, PHONEME_TAB *ph2, int which, int *match_level, int control);
494 frameref_t *LookupSpect(PHONEME_TAB *this_ph, int which, FMT_PARAMS *fmt_params,  int *n_frames, PHONEME_LIST *plist);
495 void FreePhData(void);
496 
497 unsigned char *LookupEnvelope(int ix);
498 espeak_ng_STATUS LoadPhData(int *srate, espeak_ng_ERROR_CONTEXT *context); // espeak_ng_STATUS / espeak_ng_ERROR_CONTEXT are not defined in this header
499 void FreePhData(void); // duplicate of the declaration a few lines above (harmless, but redundant)
500 
501 void SynthesizeInit(void);
// NOTE(review): `bool` is used below, but the <stdbool.h> include at the top of this header
// is commented out; the including file must make `bool` available first.
502 int  Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume);
503 void MakeWave2(PHONEME_LIST *p, int n_ph);
504 int  SpeakNextClause(int control);
505 void SetSpeed(int control);
506 void SetEmbedded(int control, int value);
507 void SelectPhonemeTable(int number);
508 int  SelectPhonemeTableName(const char *name);
509 
// NOTE(review): FILE requires <stdio.h>, which this header does not include directly.
510 void Write4Bytes(FILE *f, int value);
511 int Read4Bytes(FILE *f);
512 
513 #define ENV_LEN  128    // length of pitch envelopes
514 #define PITCHfall   0  // standard pitch envelopes
515 #define PITCHrise   2
516 #define N_ENVELOPE_DATA   20 // number of entries in envelope_data[]
517 extern unsigned char *envelope_data[N_ENVELOPE_DATA];
518 
519 extern int formant_rate[];         // max rate of change of each formant
520 extern SPEED_FACTORS speed;
521 
// Output buffer pointers, event list and callback, version information (all defined in other translation units).
522 extern long count_samples;
523 extern unsigned char *out_ptr;
524 extern unsigned char *out_start;
525 extern unsigned char *out_end;
526 extern int event_list_ix;
527 extern espeak_EVENT *event_list;
528 extern t_espeak_callback *synth_callback;
529 extern const char *version_string;
530 extern const int version_phdata;
531 extern double sonicSpeed;
532 
533 #define N_SOUNDICON_TAB  80   // total entries in soundicon_tab
534 #define N_SOUNDICON_SLOTS 4    // number of slots reserved for dynamic loading of audio files
535 extern int n_soundicon_tab;
536 extern SOUND_ICON soundicon_tab[N_SOUNDICON_TAB];
537 
// Remaining synthesis prototypes (MBROLA support, embedded commands, markers, samples and spectra).
// The functions are defined in other translation units; `bool`, FILE, USHORT and the
// espeak_ng_* types must be provided by headers included before or by this one.
538 espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate);
539 espeak_ng_STATUS SetParameter(int parameter, int value, int relative);
540 int MbrolaTranslate(PHONEME_LIST *plist, int n_phonemes, bool resume, FILE *f_mbrola);
541 int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume);
542 int MbrolaFill(int length, bool resume, int amplitude);
543 void MbrolaReset(void);
544 void DoEmbedded(int *embix, int sourceix);
545 void DoMarker(int type, int char_posn, int length, int value);
546 void DoPhonemeMarker(int type, int char_posn, int length, char *name);
547 int DoSample3(PHONEME_DATA *phdata, int length_mod, int amp);
548 int DoSpect2(PHONEME_TAB *this_ph, int which, FMT_PARAMS *fmt_params,  PHONEME_LIST *plist, int modulation);
549 int FormantTransition2(frameref_t *seq, int *n_frames, unsigned int data1, unsigned int data2, PHONEME_TAB *other_ph, int which);
550 int PauseLength(int pause, int control);
551 int LookupPhonemeTable(const char *name);
552 unsigned char *GetEnvelope(int index);
553 int NumInstnWords(USHORT *prog);
554 int GetAmplitude(void); // duplicate: already declared in the "from Wavegen file" group of this header
555 
556 void InitBreath(void);
557 
558 
559 
560 #ifdef __cplusplus
561 }
562 #endif
563